From 6525dd47276939f87c9479f5e16ba0d9f036d1b7 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Wed, 18 Jan 2023 16:59:21 +0400 Subject: [PATCH] [Snippets][CPU] Added FP32 MHA tokenization support (#14327) --- src/common/snippets/CMakeLists.txt | 5 +- .../snippets/include/snippets/generator.hpp | 30 +- .../snippets/include/snippets/op/brgemm.hpp | 47 + .../include/snippets/op/broadcastload.hpp | 10 +- .../include/snippets/op/broadcastmove.hpp | 7 +- .../snippets/include/snippets/op/buffer.hpp | 47 + .../snippets/include/snippets/op/fill.hpp | 47 + .../include/snippets/op/horizon_max.hpp | 32 + .../include/snippets/op/horizon_sum.hpp | 32 + .../snippets/include/snippets/op/kernel.hpp | 7 +- .../snippets/include/snippets/op/load.hpp | 46 +- .../snippets/include/snippets/op/loop.hpp | 111 ++ .../include/snippets/op/memory_access.hpp | 42 + .../include/snippets/op/powerstatic.hpp | 1 - .../snippets/include/snippets/op/scalar.hpp | 2 +- .../snippets/include/snippets/op/store.hpp | 26 +- .../snippets/include/snippets/op/subgraph.hpp | 114 +- .../snippets/include/snippets/op/tile.hpp | 48 - .../include/snippets/op/tile_scheduler.hpp | 39 - .../include/snippets/op/vector_buffer.hpp | 34 + .../pass/broadcast_to_movebroadcast.hpp | 28 + .../snippets/pass/collapse_subgraph.hpp | 26 +- .../pass/explicit_transpose_matmul_inputs.hpp | 32 + .../snippets/pass/fuse_transpose_brgemm.hpp | 30 + .../include/snippets/pass/insert_buffer.hpp | 30 + .../snippets/pass/insert_load_store.hpp | 4 +- .../include/snippets/pass/insert_loops.hpp | 43 + .../snippets/pass/insert_movebroadcast.hpp | 4 + .../include/snippets/pass/loop_fusion.hpp | 29 + .../include/snippets/pass/loop_helpers.hpp | 99 ++ .../snippets/pass/matmul_to_brgemm.hpp | 28 + .../snippets/pass/mha_tokenization.hpp | 28 + .../include/snippets/pass/reset_buffer.hpp | 29 + .../snippets/pass/softmax_decomposition.hpp | 30 + .../pass/softmax_reshape_elimination.hpp | 27 + .../include/snippets/pass/tokenization.hpp | 58 + 
.../snippets/pass/transpose_decomposition.hpp | 28 + .../include/snippets/snippets_isa.hpp | 9 +- .../include/snippets/snippets_isa_tbl.hpp | 4 + .../snippets/include/snippets/utils.hpp | 10 + src/common/snippets/src/generator.cpp | 249 ++-- src/common/snippets/src/op/brgemm.cpp | 64 + src/common/snippets/src/op/broadcastload.cpp | 12 +- src/common/snippets/src/op/broadcastmove.cpp | 41 +- src/common/snippets/src/op/buffer.cpp | 53 + src/common/snippets/src/op/fill.cpp | 38 + src/common/snippets/src/op/horizon_max.cpp | 28 + src/common/snippets/src/op/horizon_sum.cpp | 28 + src/common/snippets/src/op/kernel.cpp | 12 +- src/common/snippets/src/op/load.cpp | 61 +- src/common/snippets/src/op/loop.cpp | 182 +++ src/common/snippets/src/op/memory_access.cpp | 45 + src/common/snippets/src/op/powerstatic.cpp | 15 - src/common/snippets/src/op/scalar.cpp | 12 +- src/common/snippets/src/op/store.cpp | 38 +- src/common/snippets/src/op/subgraph.cpp | 421 +++++-- src/common/snippets/src/op/tile.cpp | 15 - src/common/snippets/src/op/tile_scheduler.cpp | 10 - src/common/snippets/src/op/vector_buffer.cpp | 27 + .../snippets/src/pass/align_element_type.cpp | 15 +- .../snippets/src/pass/assign_registers.cpp | 368 ++++-- .../src/pass/broadcast_to_movebroadcast.cpp | 49 + .../snippets/src/pass/collapse_subgraph.cpp | 218 ++-- .../src/pass/common_optimizations.cpp | 11 +- .../snippets/src/pass/convert_constants.cpp | 9 +- .../pass/explicit_transpose_matmul_inputs.cpp | 83 ++ .../src/pass/fuse_transpose_brgemm.cpp | 86 ++ .../snippets/src/pass/insert_buffer.cpp | 96 ++ .../snippets/src/pass/insert_load_store.cpp | 32 +- src/common/snippets/src/pass/insert_loops.cpp | 285 +++++ .../src/pass/insert_movebroadcast.cpp | 86 +- .../load_movebroadcast_to_broadcastload.cpp | 6 +- src/common/snippets/src/pass/loop_fusion.cpp | 331 +++++ src/common/snippets/src/pass/loop_helpers.cpp | 48 + .../snippets/src/pass/matmul_to_brgemm.cpp | 45 + .../snippets/src/pass/mha_tokenization.cpp | 394 ++++++ 
src/common/snippets/src/pass/reset_buffer.cpp | 114 ++ .../src/pass/softmax_decomposition.cpp | 216 ++++ .../src/pass/softmax_reshape_elimination.cpp | 70 ++ src/common/snippets/src/pass/tokenization.cpp | 72 ++ .../src/pass/transpose_decomposition.cpp | 81 ++ src/common/snippets/src/utils.cpp | 56 +- .../snippets/tests/include/lowering_utils.hpp | 8 +- .../pass/broadcast_to_movebroadcast.hpp | 29 + .../include/pass/fuse_transpose_brgemm.hpp | 33 + .../tests/include/pass/mha_tokenization.hpp | 20 + .../include/pass/softmax_decomposition.hpp | 43 + .../snippets/tests/src/lowering_utils.cpp | 63 +- .../src/pass/broadcast_to_movebroadcast.cpp | 59 + .../tests/src/pass/canonicalization.cpp | 11 +- .../tests/src/pass/collapse_subgraph.cpp | 25 +- .../tests/src/pass/fuse_transpose_brgemm.cpp | 58 + .../tests/src/pass/insert_load_store.cpp | 8 +- .../tests/src/pass/insert_movebroadcast.cpp | 13 +- .../snippets/tests/src/pass/merge_loops.cpp | 169 +++ .../tests/src/pass/mha_tokenization.cpp | 38 + .../tests/src/pass/softmax_decomposition.cpp | 122 ++ .../src/pass/softmax_reshape_elimination.cpp | 70 ++ src/common/snippets/tests/src/registers.cpp | 36 +- .../tests/onnx_import_com_microsoft.in.cpp | 2 +- .../interface/ie_internal_plugin_config.hpp | 13 + src/plugins/intel_cpu/src/config.cpp | 10 + src/plugins/intel_cpu/src/config.h | 7 + .../intel_cpu/src/emitters/cpu_generator.cpp | 20 +- .../src/emitters/jit_eltwise_emitters.cpp | 61 + .../src/emitters/jit_eltwise_emitters.hpp | 18 + .../src/emitters/jit_snippets_emitters.cpp | 1088 +++++++++++++---- .../src/emitters/jit_snippets_emitters.hpp | 264 +++- src/plugins/intel_cpu/src/extension.cpp | 11 +- .../snippets_mark_skipped.cpp | 24 +- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 583 ++++----- src/plugins/intel_cpu/src/nodes/subgraph.h | 42 +- src/plugins/intel_cpu/src/plugin.cpp | 31 +- .../fuse_load_store_and_convert.cpp | 8 +- .../op/load_convert.cpp | 14 +- .../op/load_convert.hpp | 4 +- .../op/store_convert.cpp | 
14 +- .../op/store_convert.hpp | 4 +- .../intel_cpu/src/transformation_pipeline.cpp | 114 +- .../intel_cpu/src/transformation_pipeline.h | 6 +- .../skip_tests_config.cpp | 4 + .../shared_tests_instances/snippets/add.cpp | 42 +- .../snippets/conv_eltwise.cpp | 38 +- .../snippets/convert.cpp | 30 +- .../snippets/matmul.cpp | 70 ++ .../shared_tests_instances/snippets/mha.cpp | 67 + .../snippets/select.cpp | 42 + .../snippets/softmax.cpp | 72 ++ .../snippets/transpose.cpp | 27 + .../snippets/transpose_matmul.cpp | 63 + .../snippets/transpose_softmax.cpp | 42 + .../snippets/two_inputs_and_outputs.cpp | 3 +- .../functional/subgraph_tests/src/mha.cpp | 46 +- .../snipptes_mark_skipped.cpp | 14 +- .../fake_quantize_tokenization_test.cpp | 2 +- .../plugin/shared/include/snippets/add.hpp | 17 + .../shared/include/snippets/convert.hpp | 2 +- .../plugin/shared/include/snippets/matmul.hpp | 70 ++ .../plugin/shared/include/snippets/mha.hpp | 47 + .../plugin/shared/include/snippets/select.hpp | 59 + .../shared/include/snippets/softmax.hpp | 49 + .../include/snippets/three_inputs_eltwise.hpp | 10 +- .../shared/include/snippets/transpose.hpp | 32 + .../include/snippets/transpose_matmul.hpp | 33 + .../include/snippets/transpose_softmax.hpp | 40 + .../snippets/two_inputs_and_outputs.hpp | 2 +- .../plugin/shared/src/snippets/add.cpp | 38 + .../plugin/shared/src/snippets/convert.cpp | 38 +- .../plugin/shared/src/snippets/matmul.cpp | 168 +++ .../src/snippets/max_num_params_eltwise.cpp | 4 +- .../plugin/shared/src/snippets/mha.cpp | 125 ++ .../plugin/shared/src/snippets/select.cpp | 114 ++ .../plugin/shared/src/snippets/softmax.cpp | 91 ++ .../src/snippets/three_inputs_eltwise.cpp | 1 + .../plugin/shared/src/snippets/transpose.cpp | 52 + .../shared/src/snippets/transpose_matmul.cpp | 57 + .../shared/src/snippets/transpose_softmax.cpp | 82 ++ .../src/snippets/two_inputs_and_outputs.cpp | 8 +- .../src/base/utils/generate_inputs.cpp | 5 + .../include/snippets_helpers.hpp | 6 +- 
.../include/subgraph_converts.hpp | 16 +- .../include/subgraph_customizable.hpp | 3 +- .../include/subgraph_lowered.hpp | 45 +- .../include/subgraph_matmul.hpp | 96 ++ .../include/subgraph_mha.hpp | 131 ++ .../include/subgraph_simple.hpp | 69 +- .../include/subgraph_softmax.hpp | 57 + .../include/subgraph_transpose.hpp | 36 + .../src/snippets_helpers.cpp | 4 +- .../src/subgraph_customizable.cpp | 4 +- .../src/subgraph_lowered.cpp | 366 +++++- .../src/subgraph_matmul.cpp | 92 ++ .../src/subgraph_mha.cpp | 348 ++++++ .../src/subgraph_simple.cpp | 58 +- .../src/subgraph_softmax.cpp | 52 + .../src/subgraph_transpose.cpp | 32 + 176 files changed, 10025 insertions(+), 1664 deletions(-) create mode 100644 src/common/snippets/include/snippets/op/brgemm.hpp create mode 100644 src/common/snippets/include/snippets/op/buffer.hpp create mode 100644 src/common/snippets/include/snippets/op/fill.hpp create mode 100644 src/common/snippets/include/snippets/op/horizon_max.hpp create mode 100644 src/common/snippets/include/snippets/op/horizon_sum.hpp create mode 100644 src/common/snippets/include/snippets/op/loop.hpp create mode 100644 src/common/snippets/include/snippets/op/memory_access.hpp delete mode 100644 src/common/snippets/include/snippets/op/tile.hpp delete mode 100644 src/common/snippets/include/snippets/op/tile_scheduler.hpp create mode 100644 src/common/snippets/include/snippets/op/vector_buffer.hpp create mode 100644 src/common/snippets/include/snippets/pass/broadcast_to_movebroadcast.hpp create mode 100644 src/common/snippets/include/snippets/pass/explicit_transpose_matmul_inputs.hpp create mode 100644 src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp create mode 100644 src/common/snippets/include/snippets/pass/insert_buffer.hpp create mode 100644 src/common/snippets/include/snippets/pass/insert_loops.hpp create mode 100644 src/common/snippets/include/snippets/pass/loop_fusion.hpp create mode 100644 
src/common/snippets/include/snippets/pass/loop_helpers.hpp create mode 100644 src/common/snippets/include/snippets/pass/matmul_to_brgemm.hpp create mode 100644 src/common/snippets/include/snippets/pass/mha_tokenization.hpp create mode 100644 src/common/snippets/include/snippets/pass/reset_buffer.hpp create mode 100644 src/common/snippets/include/snippets/pass/softmax_decomposition.hpp create mode 100644 src/common/snippets/include/snippets/pass/softmax_reshape_elimination.hpp create mode 100644 src/common/snippets/include/snippets/pass/tokenization.hpp create mode 100644 src/common/snippets/include/snippets/pass/transpose_decomposition.hpp create mode 100644 src/common/snippets/src/op/brgemm.cpp create mode 100644 src/common/snippets/src/op/buffer.cpp create mode 100644 src/common/snippets/src/op/fill.cpp create mode 100644 src/common/snippets/src/op/horizon_max.cpp create mode 100644 src/common/snippets/src/op/horizon_sum.cpp create mode 100644 src/common/snippets/src/op/loop.cpp create mode 100644 src/common/snippets/src/op/memory_access.cpp delete mode 100644 src/common/snippets/src/op/powerstatic.cpp delete mode 100644 src/common/snippets/src/op/tile.cpp delete mode 100644 src/common/snippets/src/op/tile_scheduler.cpp create mode 100644 src/common/snippets/src/op/vector_buffer.cpp create mode 100644 src/common/snippets/src/pass/broadcast_to_movebroadcast.cpp create mode 100644 src/common/snippets/src/pass/explicit_transpose_matmul_inputs.cpp create mode 100644 src/common/snippets/src/pass/fuse_transpose_brgemm.cpp create mode 100644 src/common/snippets/src/pass/insert_buffer.cpp create mode 100644 src/common/snippets/src/pass/insert_loops.cpp create mode 100644 src/common/snippets/src/pass/loop_fusion.cpp create mode 100644 src/common/snippets/src/pass/loop_helpers.cpp create mode 100644 src/common/snippets/src/pass/matmul_to_brgemm.cpp create mode 100644 src/common/snippets/src/pass/mha_tokenization.cpp create mode 100644 
src/common/snippets/src/pass/reset_buffer.cpp create mode 100644 src/common/snippets/src/pass/softmax_decomposition.cpp create mode 100644 src/common/snippets/src/pass/softmax_reshape_elimination.cpp create mode 100644 src/common/snippets/src/pass/tokenization.cpp create mode 100644 src/common/snippets/src/pass/transpose_decomposition.cpp create mode 100644 src/common/snippets/tests/include/pass/broadcast_to_movebroadcast.hpp create mode 100644 src/common/snippets/tests/include/pass/fuse_transpose_brgemm.hpp create mode 100644 src/common/snippets/tests/include/pass/mha_tokenization.hpp create mode 100644 src/common/snippets/tests/include/pass/softmax_decomposition.hpp create mode 100644 src/common/snippets/tests/src/pass/broadcast_to_movebroadcast.cpp create mode 100644 src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp create mode 100644 src/common/snippets/tests/src/pass/merge_loops.cpp create mode 100644 src/common/snippets/tests/src/pass/mha_tokenization.cpp create mode 100644 src/common/snippets/tests/src/pass/softmax_decomposition.cpp create mode 100644 src/common/snippets/tests/src/pass/softmax_reshape_elimination.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/select.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_softmax.cpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/matmul.hpp create mode 100644 
src/tests/functional/plugin/shared/include/snippets/mha.hpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/select.hpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/softmax.hpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/transpose.hpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/transpose_matmul.hpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/transpose_softmax.hpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/matmul.cpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/mha.cpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/select.cpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/softmax.cpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/transpose.cpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/transpose_matmul.cpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/transpose_softmax.cpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_matmul.hpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_mha.hpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_transpose.hpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_matmul.cpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_mha.cpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_transpose.cpp diff --git a/src/common/snippets/CMakeLists.txt b/src/common/snippets/CMakeLists.txt index 702543cfcf7..d3a7e47c604 100644 --- a/src/common/snippets/CMakeLists.txt 
+++ b/src/common/snippets/CMakeLists.txt @@ -26,9 +26,10 @@ ie_faster_build(${TARGET_NAME} ) target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime - PRIVATE ngraph_reference openvino::runtime::dev) + PRIVATE ngraph_reference ov_shape_inference openvino::runtime::dev) -target_include_directories(${TARGET_NAME} PUBLIC $) +target_include_directories(${TARGET_NAME} PUBLIC $ + PRIVATE $) add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME}) diff --git a/src/common/snippets/include/snippets/generator.hpp b/src/common/snippets/include/snippets/generator.hpp index 058e45c62b0..ab3156a108e 100644 --- a/src/common/snippets/include/snippets/generator.hpp +++ b/src/common/snippets/include/snippets/generator.hpp @@ -84,7 +84,7 @@ public: * @param f can this kernel be linearided to 1D range * @param p pointer to generated code */ - Schedule(const Shape& ws, bool f, code p) : work_size(ws), is_flat(f), ptr(p) {} + Schedule(const ov::PartialShape& ws, bool f, code p) : work_size(ws), is_flat(f), ptr(p) {} /** * @brief Returns callable instanse of code pointer */ @@ -92,7 +92,7 @@ public: return reinterpret_cast(const_cast(ptr)); } - Shape work_size {}; + ov::PartialShape work_size {}; bool is_flat {false}; code ptr {nullptr}; }; @@ -112,21 +112,43 @@ public: * @brief Default destructor */ virtual ~Generator() = default; + /** + * @interface GeneratorConfig + * @brief Allows to tweak the lowering process. + */ + class GeneratorConfig { + public: + // True if the lowered Emitters need to be accessed during runtime. Normally they're destroyed after code emission. 
+ bool m_save_lowered_code = false; + // True if we can optimize tails for single evaluation during code generation + // More details with optimization examples you can see in generate() method + // For example, tails with Buffer ops doesn't support single evaluation optimizations + // because of that we should always reset memory pointer using finalization offsets + // after data storing to Buffer + bool m_optimize_single_evaluation = true; + // True if we should check runtime info for nodes to call specific needed transformations + bool m_need_fill_tail_register = false; + }; /** * @brief virtual method any specific implementation should implement * @param m model in canonical for for table-based code generation + * @param config config with transformation and optimization parameters + * @param compile_params parameters for generated code * @return pointer to generated code */ - code generate(std::shared_ptr& m, const void* compile_params = nullptr) const; + code generate(std::shared_ptr& m, const GeneratorConfig& config, const void* compile_params = nullptr); /** * @brief gets target machine * @return pointer to constant target machine */ - std::shared_ptr get_target_machine() const { return target; } + std::shared_ptr get_target_machine() const; protected: std::shared_ptr target; + // todo: we need to save lowered code to access compiled brgemm kernels on execution time (normally lowered is destructed by then). + // This is temporary solution, remove this when kernel caching is implemented. Don't forget to make generate const method. 
+ std::vector lowered_saved; }; } // namespace snippets diff --git a/src/common/snippets/include/snippets/op/brgemm.hpp b/src/common/snippets/include/snippets/op/brgemm.hpp new file mode 100644 index 00000000000..2746d974a06 --- /dev/null +++ b/src/common/snippets/include/snippets/op/brgemm.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" +#include "ngraph/op/matmul.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface Brgemm + * @brief Brgemm is a batch-reduced matrix multiplication with the support of arbitrary strides between matrices rows + * @ingroup snippets + */ +class Brgemm : public ngraph::op::v0::MatMul { +public: + OPENVINO_OP("Brgemm", "SnippetsOpset", ngraph::op::v0::MatMul); + Brgemm(const Output& A, const Output& B, const size_t offset_a = 0lu, const size_t offset_b = 0lu, const size_t offset_c = 0lu); + Brgemm() = default; + + bool visit_attributes(AttributeVisitor& visitor) override; + void validate_and_infer_types() override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + + bool has_evaluate() const override { return false; } + + size_t get_offset_a() const { return m_offset_a; } + size_t get_offset_b() const { return m_offset_b; } + size_t get_offset_c() const { return m_offset_c; } + + void set_offset_a(const size_t offset) { m_offset_a = offset; } + void set_offset_b(const size_t offset) { m_offset_b = offset; } + void set_offset_c(const size_t offset) { m_offset_c = offset; } + +private: + size_t m_offset_a = 0lu; // offset for first input + size_t m_offset_b = 0lu; // offset for second input + size_t m_offset_c = 0lu; // offset for output +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/broadcastload.hpp 
b/src/common/snippets/include/snippets/op/broadcastload.hpp index 8dce9ee2ab9..43f3a329adc 100644 --- a/src/common/snippets/include/snippets/op/broadcastload.hpp +++ b/src/common/snippets/include/snippets/op/broadcastload.hpp @@ -21,12 +21,18 @@ class BroadcastLoad : public BroadcastMove { public: OPENVINO_OP("BroadcastLoad", "SnippetsOpset", ngraph::snippets::op::BroadcastMove); - BroadcastLoad(const Output& x, Shape output_shape); + BroadcastLoad(const Output& x, ov::PartialShape output_shape, size_t offset = 0lu); BroadcastLoad() = default; - std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + size_t get_offset() const { return m_offset; } + void set_offset(const size_t offset) { m_offset = offset; } + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; void validate_and_infer_types() override; + +private: + size_t m_offset = 0lu; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/broadcastmove.hpp b/src/common/snippets/include/snippets/op/broadcastmove.hpp index cdcca462cc0..0d6368970b8 100644 --- a/src/common/snippets/include/snippets/op/broadcastmove.hpp +++ b/src/common/snippets/include/snippets/op/broadcastmove.hpp @@ -19,7 +19,7 @@ class BroadcastMove : public ngraph::op::Op { public: OPENVINO_OP("BroadcastMove", "SnippetsOpset"); - BroadcastMove(const Output& x, Shape output_shape); + BroadcastMove(const Output& x, ov::PartialShape output_shape); BroadcastMove() = default; bool visit_attributes(AttributeVisitor& visitor) override; @@ -28,12 +28,9 @@ public: void validate_and_infer_types() override; - OPENVINO_SUPPRESS_DEPRECATED_START - bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override; - OPENVINO_SUPPRESS_DEPRECATED_END protected: - Shape output_shape; + ov::PartialShape output_shape; }; } // namespace op diff --git 
a/src/common/snippets/include/snippets/op/buffer.hpp b/src/common/snippets/include/snippets/op/buffer.hpp new file mode 100644 index 00000000000..f75fc95e742 --- /dev/null +++ b/src/common/snippets/include/snippets/op/buffer.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface Buffer + * @brief The operation is for intermediate data storage + * - m_allocation_rank - rank of shape for memory allocation: shape[shape_rank - normalize(m_allocation_rank) : shape_rank]. + * It's needed to allocate needed memory size that depends on Tile rank, for example. + * Default value is -1 (full shape) + * Notes: + * - All buffers in a graph have the same memory pointer. So if we have a few buffers, + * each the corresponding MemoryAccess op for Buffer should have offset for common memory pointer of this Buffer + * - Buffer should be a single consumer for operation output port + * @ingroup snippets + */ +class Buffer : public ngraph::op::Op { +public: + OPENVINO_OP("Buffer", "SnippetsOpset"); + + Buffer(const Output& x, const int32_t allocation_rank = -1); + Buffer() = default; + + int32_t get_allocation_rank() const { return m_allocation_rank; } + void set_allocation_rank(int32_t rank) { m_allocation_rank = rank; } + + size_t get_byte_size() const; + + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +private: + int32_t m_allocation_rank = -1; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/fill.hpp b/src/common/snippets/include/snippets/op/fill.hpp new file mode 100644 index 00000000000..85b95ec3799 --- /dev/null +++ b/src/common/snippets/include/snippets/op/fill.hpp @@ -0,0 +1,47 @@ +// Copyright 
(C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface Fill + * @brief Generated in Tail Loop vector representation in code generation step for cases when we should + * refill registers by special values. + * For example, for cases with ReduceMax or ReduceSum in Softmax + * Where: + * - offset - starting element index where filling is performed while beginning of input data is untouched + * - fill_value - hexadecimal filling value + * @ingroup snippets + */ +class Fill : public ngraph::op::Op { +public: + OPENVINO_OP("Fill", "SnippetsOpset"); + + Fill(const Output& x, const size_t offset, const uint32_t fill_value = 0x0); + Fill() = default; + + size_t get_offset() const { return m_offset; } + uint32_t get_fill_value() const { return m_fill_value; } + + void set_offset(const size_t offset) { m_offset = offset; } + void set_fill_value(const uint32_t fill_value) { m_fill_value = fill_value; } + + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +protected: + size_t m_offset = 0lu; + uint32_t m_fill_value = 0x0; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/horizon_max.hpp b/src/common/snippets/include/snippets/op/horizon_max.hpp new file mode 100644 index 00000000000..d26c4a8c9e5 --- /dev/null +++ b/src/common/snippets/include/snippets/op/horizon_max.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface HorizonMax + * @brief The operation calculates a horizon maximum of a vector register + * @ingroup snippets + */ +class HorizonMax : 
public ngraph::op::Op { +public: + OPENVINO_OP("HorizonMax", "SnippetsOpset"); + + HorizonMax(const Output& x); + HorizonMax() = default; + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/horizon_sum.hpp b/src/common/snippets/include/snippets/op/horizon_sum.hpp new file mode 100644 index 00000000000..2dc25374bc0 --- /dev/null +++ b/src/common/snippets/include/snippets/op/horizon_sum.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface HorizonSum + * @brief The operation calculates a horizon sum of a vector register + * @ingroup snippets + */ +class HorizonSum : public ngraph::op::Op { +public: + OPENVINO_OP("HorizonSum", "SnippetsOpset"); + + HorizonSum(const Output& x); + HorizonSum() = default; + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/kernel.hpp b/src/common/snippets/include/snippets/op/kernel.hpp index f0b64de2b9a..a8d17745fde 100644 --- a/src/common/snippets/include/snippets/op/kernel.hpp +++ b/src/common/snippets/include/snippets/op/kernel.hpp @@ -20,13 +20,14 @@ class Kernel : public ngraph::op::Op { public: OPENVINO_OP("Kernel", "SnippetsOpset"); - Kernel(const std::vector, ngraph::snippets::RegInfo>>& region); + Kernel(std::vector region, std::shared_ptr m); Kernel() = default; - std::vector, 
ngraph::snippets::RegInfo>> region; + std::vector region; + const std::shared_ptr model; std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override { - return std::make_shared(region); + return std::make_shared(region, model); } const void *compile_params = nullptr; }; diff --git a/src/common/snippets/include/snippets/op/load.hpp b/src/common/snippets/include/snippets/op/load.hpp index 2d412778035..bd0a4c5463f 100644 --- a/src/common/snippets/include/snippets/op/load.hpp +++ b/src/common/snippets/include/snippets/op/load.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include "snippets/op/memory_access.hpp" namespace ngraph { namespace snippets { @@ -12,36 +13,41 @@ namespace op { /** * @interface Load - * @brief Generated by Canonicalization step where explicit instructions should be emitted for data loading - * where number of elements to load is determined by "count" - * Default value is "1" - to load one element + * @brief Generated during Lowering stage (convert_to_snippets_dialect) where explicit instructions should be emitted for data loading + * where number of elements to load is determined by "count" (Default value is "1" - to load one element) + * and memory offset for loading is determined by "offset" (Default value is "0" - to load starting from the first element) * @ingroup snippets */ -class Load : public ngraph::op::Op { +class Load : public MemoryAccess { public: OPENVINO_OP("Load", "SnippetsOpset"); - Load(const Output& x, const size_t count = 1lu); + Load(const Output& x, const size_t count = 1lu, const size_t offset = 0lu); Load() = default; - size_t get_count() const { return m_count; } - - void set_count(const size_t count) { m_count = count; } - - bool visit_attributes(AttributeVisitor& visitor) override; - std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - - void validate_and_infer_types() override; - - OPENVINO_SUPPRESS_DEPRECATED_START - bool evaluate(const HostTensorVector& 
output_values, const HostTensorVector& input_values) const override; - OPENVINO_SUPPRESS_DEPRECATED_END - -protected: - size_t m_count = 0lu; }; +/** + * @interface LoadReshape + * @brief It's just Load operation (and it's mapped on LoadEmitter during code generation) that allows to tweak + * shape propagation. We need it to keep correct shape propagation when Transpose is decomposed to + * Load and Store. This is a temporary solution until tokenization of Reshape operation is supported. + * @ingroup snippets + */ +class LoadReshape : public Load { +public: + OPENVINO_OP("LoadReshape", "SnippetsOpset", Load); + LoadReshape(const Output& x, size_t count = 1lu, const size_t offset = 0lu, std::vector order = {}); + LoadReshape() = default; + + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +private: + std::vector m_order; +}; } // namespace op } // namespace snippets } // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/loop.hpp b/src/common/snippets/include/snippets/op/loop.hpp new file mode 100644 index 00000000000..89cf0abd517 --- /dev/null +++ b/src/common/snippets/include/snippets/op/loop.hpp @@ -0,0 +1,111 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" +#include "snippets/emitter.hpp" +#include "ngraph/op/parameter.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface LoopBase + * @brief Base class for LoopBegin and LoopEnd + * @ingroup snippets + */ +class LoopBase : public ngraph::op::Op { +public: + OPENVINO_OP("LoopBase", "SnippetsOpset"); + LoopBase(const std::vector>& args, size_t work_amount, size_t increment); + LoopBase() = default; + bool visit_attributes(AttributeVisitor& visitor) override; + size_t get_work_amount() const; + size_t get_increment() const; + bool 
get_evaluate_once() const; + +protected: + size_t work_amount; + size_t work_amount_increment; + bool evaluate_once; // true if the Loop is executed only once, used to skip setting and testing the loop counter +}; +class LoopEnd; +/** + * @interface LoopBegin + * @brief Marks the start of the Loop region. + * Number of outputs always equals to the number of inputs (bypassed values) + 1 (edge to the corresponding LoopEnd) + * @param args - vector of input values, they are passed directly to output. + * @ingroup snippets + */ +class LoopBegin : public LoopBase { + friend LoopEnd; + +public: + OPENVINO_OP("LoopBegin", "SnippetsOpset", LoopBase); + explicit LoopBegin(const OutputVector& args); + LoopBegin() = default; + void validate_and_infer_types() override; + std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override; + std::shared_ptr get_loop_end(); + // begin_address and input_regs are needed to communicate information between LoopBegin and LoopEnd emitters + const uint8_t* begin_address; + std::vector input_regs; + +private: + void validate_and_infer_types_except_LoopEnd(); + LoopBegin(const std::vector>& args, size_t work_amount, size_t work_amount_increment); +}; + +/** + * @interface LoopEnd + * @brief Marks the end of the Loop region and defines the loop properties. + * Number of outputs always equals to the number of inputs (bypassed values) - 1 (edge to the corresponding LoopEnd) + * @param args vector of input values + LoopBegin, all values except for the LoopBegin are passed directly to output. + * @param work_amount total number of evaluations to be processed by the loop + * @param increment number of evaluations processed in one iteration of the loop. + * @param apply_increment describes which data pointers attributed to the loop should be incremented on every iteration. + * should be used when Loop is connected to Parameters and/or Results. 
If apply_increment[i] == true then i-th i/o data + * pointer will be incremented by work_amount*data_size on every iteration. + * @param ptr_increments specifies i/o pointer increment performed on every iteration. This is an alternative to + * apply_increments, which enables more flexibility. + * @param finalization_offsets pointer increments that are be applied to i/o pointers before exiting the loop + * @ingroup snippets + */ +class LoopEnd : public LoopBase { +public: + OPENVINO_OP("LoopEnd", "SnippetsOpset", LoopBase); + LoopEnd(const std::vector>& args, size_t work_amount, size_t work_amount_increment, + std::vector apply_increment, std::vector finalization_offsets); + LoopEnd(const std::vector>& args, size_t work_amount, size_t work_amount_increment, + std::vector ptr_increments, std::vector finalization_offsets); + LoopEnd() = default; + std::shared_ptr get_loop_begin(); + void validate_and_infer_types() override; + std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override; + const std::vector& get_finalization_offsets() const; + const std::vector& get_ptr_increments() const; + void set_finalization_offsets(std::vector offsets); + void set_ptr_increments(std::vector new_ptr_increments); + // update_ptr_increments resets non-zero increments to the new_increments. It's used when work_amount_increment is + // updated and we need to refresh ptr increments accordingly while respecting the broadcasting pattern + void update_ptr_increments(int64_t new_increment); + void set_work_amount(size_t new_work_amount); + void set_increment(size_t new_increment); + void set_evaluate_once(bool once); + // Used to propagate information about Loop structure, needed to simplify some optimizations. 
For example, + // to skip pointer increments when outer Loop is empty, and work_amount == vector_size (one inner vector Loop) + // true by default, the optimizations enabled if it's false; + bool has_outer_loop; + +private: + std::vector ptr_increments; + std::vector finalization_offsets; + size_t loop_io_size; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/memory_access.hpp b/src/common/snippets/include/snippets/op/memory_access.hpp new file mode 100644 index 00000000000..f1b2d8ebb2f --- /dev/null +++ b/src/common/snippets/include/snippets/op/memory_access.hpp @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface MemoryAccess + * @brief This is a base class for memory access operations (like Load and Store). + * It provides universal set/get interface to manipulate the number + * of elements accessed during one operation call ("count"). 
+ * Default "count" value is "1" - it means to load/store one element + * @ingroup snippets + */ + +class MemoryAccess : public ngraph::op::Op { +public: + OPENVINO_OP("MemoryAccess", "SnippetsOpset"); + + size_t get_count() const; + size_t get_offset() const; + void set_count(const size_t count); + void set_offset(const size_t offset); + bool visit_attributes(AttributeVisitor& visitor) override; + void validate_and_infer_types() override; + +protected: + explicit MemoryAccess(const Output& x, size_t count = 1lu, size_t offset = 0lu); + MemoryAccess() = default; + size_t m_count = 0lu; + size_t m_offset = 0lu; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/powerstatic.hpp b/src/common/snippets/include/snippets/op/powerstatic.hpp index f4dbe12f9ba..2f4e3fbcfa2 100644 --- a/src/common/snippets/include/snippets/op/powerstatic.hpp +++ b/src/common/snippets/include/snippets/op/powerstatic.hpp @@ -20,7 +20,6 @@ namespace op { class PowerStatic : public ov::op::util::UnaryElementwiseArithmetic { public: OPENVINO_OP("PowerStatic", "SnippetsOpset", ov::op::util::UnaryElementwiseArithmetic); - BWDCMP_RTTI_DECLARATION; PowerStatic() = default; PowerStatic(const Output &arg, float power) : UnaryElementwiseArithmetic(arg), power(power) { diff --git a/src/common/snippets/include/snippets/op/scalar.hpp b/src/common/snippets/include/snippets/op/scalar.hpp index 009f3028e92..108a34d6005 100644 --- a/src/common/snippets/include/snippets/op/scalar.hpp +++ b/src/common/snippets/include/snippets/op/scalar.hpp @@ -19,7 +19,6 @@ namespace op { class Scalar : public ov::op::v0::Constant { public: OPENVINO_OP("Scalar", "SnippetsOpset", ov::op::v0::Constant); - BWDCMP_RTTI_DECLARATION; Scalar() = default; @@ -37,6 +36,7 @@ public: std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; void validate_and_infer_types() override; + bool visit_attributes(AttributeVisitor& visitor) 
override; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/store.hpp b/src/common/snippets/include/snippets/op/store.hpp index dec50f179e9..38715cffc6c 100644 --- a/src/common/snippets/include/snippets/op/store.hpp +++ b/src/common/snippets/include/snippets/op/store.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include "snippets/op/memory_access.hpp" namespace ngraph { namespace snippets { @@ -12,34 +13,19 @@ namespace op { /** * @interface Store - * @brief Generated by Canonicalization step where explicit instructions should be emitted for data storing - * where number of elements to store is determined by "count" - * Default value is "1" - to store one element + * @brief Generated during Lowering stage (convert_to_snippets_dialect) where explicit instructions should be emitted for data storing + * where number of elements to store is determined by "count" (Default value is "1" - to store one element) + * and memory offset for storing is determined by "offset" (Default value is "0" - to store starting at start memory ptr) * @ingroup snippets */ -class Store : public ngraph::op::Op { +class Store : public MemoryAccess { public: OPENVINO_OP("Store", "SnippetsOpset"); - Store(const Output& x, const size_t count = 1lu); + Store(const Output& x, const size_t count = 1lu, const size_t offset = 0lu); Store() = default; - size_t get_count() const { return m_count; } - - void set_count(const size_t count) { m_count = count; } - - bool visit_attributes(AttributeVisitor& visitor) override; - std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - - void validate_and_infer_types() override; - - OPENVINO_SUPPRESS_DEPRECATED_START - bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override; - OPENVINO_SUPPRESS_DEPRECATED_END - -protected: - size_t m_count = 0lu; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp 
b/src/common/snippets/include/snippets/op/subgraph.hpp index 569b9dae9bf..ec55f076301 100644 --- a/src/common/snippets/include/snippets/op/subgraph.hpp +++ b/src/common/snippets/include/snippets/op/subgraph.hpp @@ -26,7 +26,7 @@ namespace op { class Subgraph : public ov::op::util::SubGraphOp { public: OPENVINO_OP("Subgraph", "SnippetsOpset", ov::op::util::SubGraphOp); - BWDCMP_RTTI_DECLARATION; + enum {DYNAMIC_DIMENSION = 0xffffffffffffffff}; // < 1, 42, 17, 15, 16> < 0, 1, 2, 3, 1> // should be: @@ -69,7 +69,7 @@ public: // // D = < 1, 3, 17, 15, 32> < 0, 1, 2, 3, 4> // E = < 1, 3, 17, 1, 32> < 0, 1, 2, 3, 4> - using BlockedShape = std::tuple; + using BlockedShape = std::tuple; using BlockedShapeVector = std::vector; Subgraph() = default; @@ -86,80 +86,82 @@ public: // we introduce this method instead of using SubGraphOp::get_function() // to align naming with other methods - const std::shared_ptr & body_ptr() const { - return m_bodies[0]; - } + const std::shared_ptr& body_ptr() const { return m_bodies[0]; } + std::shared_ptr& body_ptr() { return m_bodies[0]; } - std::shared_ptr & body_ptr() { - return m_bodies[0]; - } + const ov::Model& body() const { return *m_bodies[0]; } + ov::Model& body() { return *m_bodies[0]; } - const ov::Model & body() const { - return *m_bodies[0]; - } + const std::shared_ptr& get_generator() const { return m_generator; } + std::shared_ptr & get_generator() { return m_generator; } - ov::Model & body() { - return *m_bodies[0]; - } - - const std::shared_ptr & get_generator() const { - return m_generator; - } - - std::shared_ptr & get_generator() { - return m_generator; - } - - size_t get_non_scalar_constants_count() const { - return m_non_scalar_constants_count; - } - - bool is_quantized() const { - return config.m_is_quantized; - } - - bool has_type_relaxed_ops() const { - return config.m_has_type_relaxed_ops; - } + size_t get_buffer_scratchpad_size() const { return m_buffer_scratchpad; } + size_t get_virtual_port_count() const { return 
m_virtual_port_count; } + bool is_buffer_needed() const { return m_buffer_needed; } + bool is_quantized() const { return config.m_is_quantized; } + bool has_type_relaxed_ops() const { return config.m_has_type_relaxed_ops; } + bool has_domain_sensitive_ops() const { return config.m_has_domain_sensitive_ops; } snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, ngraph::pass::Manager& opt, const void* compile_params = nullptr); snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, const void* compile_params = nullptr); snippets::Schedule generate(ngraph::pass::Manager &opt, const void* compile_params = nullptr); snippets::Schedule generate(const void* compile_params = nullptr); - Shape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes); + ov::PartialShape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes); + std::vector reshape_body(const std::vector& input_shapes); + std::vector reshape_body(const std::vector& input_shapes); // plugin sets generator for a snippet to some specific generator. 
// it's going to be replaced with Jitters table later void set_generator(std::shared_ptr generator); - void set_non_scalar_constants_count(const size_t count); + void set_tile_rank(size_t newRank) {tileRank = newRank;} + void set_virtual_port_count(const size_t count); + void set_buffer_needed(const bool need); void print() const; void print_statistics(bool verbose); void serialize() const; + void set_master_shape(ov::PartialShape new_shape) {master_shape = std::move(new_shape);} static auto wrap_node_as_subgraph(const std::shared_ptr& node) -> std::shared_ptr; static void fill_empty_output_names(const Output& target_output_node, const Output& replacement_output_node); + // Non-scalar Constants are tokenized as Parameters inside Subgraph body but some operations with constant inputs + // should have explicit Constants even if they're non-scalar (Reshape, Transpose, Broadcast) + // This check returns True if Constant op which is input of this op should be inside Subgraph body + static auto constant_input_should_be_inside_body(const std::shared_ptr& node) -> bool; + private: void align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes); void convert_to_snippet_dialect(); - - // Count of potentional non-scalar Consants that will be created after some tranformations - // At the moment it's relevant only for FakeQuantize decomposition - // NOTE: To avoid overheads in each calcution of this count (for example, in validate_and_type_infer()), + void init_config(); + void initialize_buffer_scratchpad_size(); + // Count of Subgraph virtual ports: + // - Potential non-scalar Constants that will be created after some transformations (At the moment it's relevant only for FakeQuantize decomposition) + // Need Buffer op or not + // - Buffers. All Buffers are considered as one common additional virtual port. 
So we cannot summarize them as potential non-scalar Constants + // NOTE: To avoid overheads in each calculation of this count (for example, in validate_and_type_infer()), // we should MANUALLY calculate it where it needed. - size_t m_non_scalar_constants_count = 0; + size_t m_virtual_port_count = 0; + bool m_buffer_needed = false; + size_t m_buffer_scratchpad = 0lu; Shape exec_domain = {}; std::shared_ptr m_generator = nullptr; // TODO: Change logic of insert Converts. This exec element type can be different for plugins const ov::element::Type execution_element_type = ov::element::f32; - // Config to know which transformations should be called. - // It helps to avoid overheads of extra transformation calls - struct { + ov::PartialShape master_shape; + size_t tileRank = 0; // set by plugin to specify the number of dimensions processed in a single kernel call + + /** + * @interface SubgraphConfig + * @brief Config to optimize IR transformation pipeline. It indicates which transformations are necessary + * so the irrelevant ones could be skipped. + */ + class SubgraphConfig { + public: // True if Subgraph contains FakeQuantize -> FQ decomposition should be called bool m_is_quantized = false; // True if we should align element types indise body @@ -167,6 +169,12 @@ private: // True if Subgraph contains TypeRelaxed nodes -> for several streams in tp mode we should copy body using mutexes // because TypeRelaxed::copy_with_new_inputs() isn't save-thread method bool m_has_type_relaxed_ops = false; + // True if body has operations that don't support plugin-side domain optimizations + // (e.g. Transpose, Softmax, MatMul in general doesn't support dimensions collapsing) + bool m_has_domain_sensitive_ops = false; + // True if we should go through whole body to check for where loops should be explicitly inserted. 
+ // Otherwise, we insert Loops on Parameters and Results - for example, it's optimized out for subgraph with only Eltwise ops + bool m_explicit_loop_insertion = false; } config; }; @@ -190,6 +198,24 @@ static inline auto build_subgraph(const std::shared_ptr& node, con return subgraph; }; +// Need to update tensor name manually, since intel_cpu::Graph::Replicate() looks at input.get_tensor().get_name(); +// If subgraph->get_output_size() == 1, then the name will be restored correctly from the node name +auto inline update_out_tensor_name(const std::shared_ptr& subgraph) -> void { + bool not_set = true; + for (unsigned int i = 0; i < subgraph->get_output_size() && not_set; i++) { + for (const auto &in : subgraph->get_output_target_inputs(i)) { + if (ov::is_type(in.get_node())) { + const auto& body_result = subgraph->body_ptr()->get_output_op(i); + const auto& body_result_input = body_result->get_input_source_output(0); + ngraph::snippets::op::Subgraph::fill_empty_output_names( + subgraph->output(i), body_result_input); + not_set = false; + break; + } + } + } +} + } // namespace op } // namespace snippets } // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/tile.hpp b/src/common/snippets/include/snippets/op/tile.hpp deleted file mode 100644 index 5401a91c657..00000000000 --- a/src/common/snippets/include/snippets/op/tile.hpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "ngraph/op/op.hpp" -#include "snippets/emitter.hpp" - -namespace ngraph { -namespace snippets { -namespace op { - -/** - * @interface Tile - * @brief Generated by Canonicalization and represents Loop in affine notation - * @ingroup snippets - */ -class Tile : public ngraph::op::Op { -public: - OPENVINO_OP("Tile", "SnippetsOpset"); - - /// \brief Construct an Tile - /// \param region The vector of pairs: emitters and the corresponding registers - /// \param increment Tile size - 
count of elements to load and store. - /// Vector Tile should have size of vector register and Scalar Tile should have 1 - /// \param num_inputs Count of inputs - /// \param num_outputs Count of outputs - /// \param io_dims Vector of last dimensions of inputs and outputs - /// \param io_data_sizes Vector of data type sizes of inputs and outputs - Tile(const std::vector& region, size_t increment, size_t num_inputs, size_t num_outputs, - const std::vector& io_dims, const std::vector& io_data_sizes); - Tile() = default; - std::vector region; - size_t increment = 0; - size_t num_inputs = 0; - size_t num_outputs = 0; - std::vector io_dims {}; - std::vector io_data_size {}; - - std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override { - return std::make_shared(region, increment, num_inputs, num_outputs, io_dims, io_data_size); - } -}; - -} // namespace op -} // namespace snippets -} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/tile_scheduler.hpp b/src/common/snippets/include/snippets/op/tile_scheduler.hpp deleted file mode 100644 index 9d6010f7797..00000000000 --- a/src/common/snippets/include/snippets/op/tile_scheduler.hpp +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (C) 2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "ngraph/op/op.hpp" -#include "snippets/emitter.hpp" -#include "tile.hpp" - -namespace ngraph { -namespace snippets { -namespace op { - -/** - * @interface TileScheduler - * @brief Contains a set of Tiles (currently one vector and one scalar) and performs necessary preparations - * before the Tiles could be executed: calculates offsets, sets proper work amounts, decrement pointers if the same data - * have to be read several times (broadcasting). 
- * @ingroup snippets - */ -class TileScheduler : public ngraph::op::Op { -public: - OPENVINO_OP("TileScheduler", "SnippetsOpset"); - - TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region); - TileScheduler() = default; - AllocatedEmitter vector_region; - AllocatedEmitter scalar_region; - // todo: this clone_with_new_inputs is irrelevant - std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override { - return std::make_shared(vector_region, scalar_region); - } - const void *compile_params; -}; - -} // namespace op -} // namespace snippets -} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/vector_buffer.hpp b/src/common/snippets/include/snippets/op/vector_buffer.hpp new file mode 100644 index 00000000000..9d93e4c0157 --- /dev/null +++ b/src/common/snippets/include/snippets/op/vector_buffer.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface VectorBuffer + * @brief The operation is for intermediate data storage in vector register + * @ingroup snippets + */ +class VectorBuffer : public ngraph::op::Op { +public: + OPENVINO_OP("VectorBuffer", "SnippetsOpset"); + + VectorBuffer(const ov::element::Type element_type = ov::element::f32); + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +private: + ov::element::Type m_element_type; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/broadcast_to_movebroadcast.hpp b/src/common/snippets/include/snippets/pass/broadcast_to_movebroadcast.hpp new file mode 100644 index 00000000000..0c90c1193ea --- /dev/null +++ 
b/src/common/snippets/include/snippets/pass/broadcast_to_movebroadcast.hpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface BroadcastToMoveBroadcast + * @brief Inserts explicit MoveBroadcast instruction if broadcasting by most varying dimension is needed instead of Broadcast. + * Otherwise the pass removes Broadcast operation. + * @ingroup snippets + */ +class BroadcastToMoveBroadcast: public ngraph::pass::MatcherPass { +public: + BroadcastToMoveBroadcast(); +}; + + +} // namespace pass +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/pass/collapse_subgraph.hpp b/src/common/snippets/include/snippets/pass/collapse_subgraph.hpp index 2d6f7c0d963..96c272a28d5 100644 --- a/src/common/snippets/include/snippets/pass/collapse_subgraph.hpp +++ b/src/common/snippets/include/snippets/pass/collapse_subgraph.hpp @@ -12,28 +12,6 @@ namespace ngraph { namespace snippets { namespace pass { -/* - NotSet - default value returned by GetSnippetsNodeType(...) if the node wasn't marked - SkippedByPlugin - indicate that snippets can't include this node in subgraph. Can be set by Plugin via SetSnippetsNodeType(...). - */ -enum class SnippetsNodeType : int64_t {NotSet, SkippedByPlugin}; -void SetSnippetsNodeType(const std::shared_ptr&, SnippetsNodeType); -SnippetsNodeType GetSnippetsNodeType(const std::shared_ptr&); -void SetTopologicalOrder(const std::shared_ptr&, int64_t); -int64_t GetTopologicalOrder(const std::shared_ptr&); -bool AppropriateForSubgraph(const std::shared_ptr&); - -/** - * @interface EnumerateNodes - * @brief Snippets rely on topological order to avoid creating cyclic dependencies. This transformation sets the order. 
- * @ingroup snippets - */ -class EnumerateNodes : public ov::pass::ModelPass { -public: - OPENVINO_RTTI("EnumerateNodes", "0"); - EnumerateNodes() : ModelPass() {} - bool run_on_model(const std::shared_ptr&) override; -}; /** * @interface TokenizeSnippets @@ -61,6 +39,10 @@ class TokenizeSnippets: public ngraph::pass::MatcherPass { public: OPENVINO_RTTI("TokenizeSnippets", "0"); explicit TokenizeSnippets(); + + static bool AppropriateForSubgraph(const std::shared_ptr&); + + static const std::set supported_element_types; }; } // namespace pass diff --git a/src/common/snippets/include/snippets/pass/explicit_transpose_matmul_inputs.hpp b/src/common/snippets/include/snippets/pass/explicit_transpose_matmul_inputs.hpp new file mode 100644 index 00000000000..fc90067f4af --- /dev/null +++ b/src/common/snippets/include/snippets/pass/explicit_transpose_matmul_inputs.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface ExplicitTransposeMatMulInputs + * @brief At the moment Snippets supports Transpose only with order {0, 2, 3, 1}, + * so if there is pattern in graph: + * in0 Transpose{0, 2, 1, 3} + * \ / + * MatMul[false, true] + * We can set false in MatMul parameter `transposed_b` and + * change Transpose order to {0, 2, 3, 1} which is supported by Snippets + * @ingroup snippets + */ +class ExplicitTransposeMatMulInputs: public ngraph::pass::MatcherPass { +public: + ExplicitTransposeMatMulInputs(); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp b/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp new file mode 100644 index 00000000000..1c2eaa11ea0 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp @@ -0,0 +1,30 @@ +// Copyright (C) 
2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/pass/graph_rewrite.hpp" +#include "ngraph/pattern/matcher.hpp" + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface FuseTransposeBrgemm + * @brief Fuses Transpose with Brgemm node, fusing on both Brgemm inputs and output is supported. Applicable to + * Transposes that don't change the position of the last dimension (since Brgemm supports strided rows i/o), + * but only 0213 Transpose is currently supported. + * @ingroup snippets + */ +class FuseTransposeBrgemm: public ngraph::pass::MatcherPass { +public: + OPENVINO_RTTI("FuseTransposeBrgemm", "0"); + FuseTransposeBrgemm(); + static const std::set> supported_cases; +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/pass/insert_buffer.hpp b/src/common/snippets/include/snippets/pass/insert_buffer.hpp new file mode 100644 index 00000000000..a7fe4f00208 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/insert_buffer.hpp @@ -0,0 +1,30 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface InsertBuffer + * @brief The pass inserts Buffers on Inputs and Outputs of special operations [Softmax, Transpose] is it's needed + * @param allocation_rank - rank of shape for Buffer memory allocation: shape[shape_rank - normalize(m_allocation_rank) : shape_rank]. + * It's needed to allocate needed memory size that depends on Tile rank, for example. 
+ * Default value is -1 (full shape) + * @ingroup snippets + */ +class InsertBuffer: public ngraph::pass::MatcherPass { +public: + InsertBuffer(const int32_t allocation_rank = -1); +}; + + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/insert_load_store.hpp b/src/common/snippets/include/snippets/pass/insert_load_store.hpp index 09911e62d8b..aab892312bf 100644 --- a/src/common/snippets/include/snippets/pass/insert_load_store.hpp +++ b/src/common/snippets/include/snippets/pass/insert_load_store.hpp @@ -13,7 +13,7 @@ namespace pass { /** * @interface InsertLoad - * @brief Inserts explicit load instruction after each parameter. + * @brief Inserts explicit load instruction after each parameter and buffer. * The pass is used to convert model to a canonical form for code generation * @ingroup snippets */ @@ -24,7 +24,7 @@ public: /** * @interface InsertStore - * @brief Inserts explicit store instruction before each result. + * @brief Inserts explicit store instruction before each result and buffer. 
* The pass is used to convert model to a canonical form for code generation * @ingroup snippets */ diff --git a/src/common/snippets/include/snippets/pass/insert_loops.hpp b/src/common/snippets/include/snippets/pass/insert_loops.hpp new file mode 100644 index 00000000000..57046789167 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/insert_loops.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface InsertLoops + * @brief Insert explicit Loop operations into the body to process multiple data entities during one kernel execution + * @param master_shape - shape used to determine loop work amounts + * @param loop_depth - the number of last master_shape dimensions processed by loops (aka tileRank - obsolete), could be 1 or 2 + * @param vector_size - the number of entities processed on one iteration of vector loop + * @param single_loop_body - true, if we can just insert LoopBegin on inputs and LoopEnd on outputs, otherwise + * the pass goes all over the body analyzing where LoopBegin and LoopEnd should be inserted: + * synchronization nodes are MatMul, Buffer and other already existing Loops.
+ * @ingroup snippets + */ +class InsertLoops: public ngraph::pass::FunctionPass { +public: + OPENVINO_RTTI("InsertLoops", "0"); + InsertLoops(ov::PartialShape master_shape, size_t loop_depth, size_t vector_size, bool is_optimized = true); + bool run_on_model(const std::shared_ptr& m) override; + + static std::vector calculate_inner_apply_increments(const ov::PartialShape& master, const std::vector& shapes); + static std::vector calculate_outer_apply_increments(const std::vector& shapes); + static std::vector calculate_finalization_offsets(const ov::PartialShape& master, const std::vector& shapes); +private: + ov::PartialShape m_master_shape; + size_t m_loop_depth; + size_t m_vector_size; + bool m_single_loop_body; +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/insert_movebroadcast.hpp b/src/common/snippets/include/snippets/pass/insert_movebroadcast.hpp index 14fe951a12a..e0458e0b263 100644 --- a/src/common/snippets/include/snippets/pass/insert_movebroadcast.hpp +++ b/src/common/snippets/include/snippets/pass/insert_movebroadcast.hpp @@ -20,6 +20,10 @@ namespace pass { class InsertMoveBroadcast: public ngraph::pass::MatcherPass { public: InsertMoveBroadcast(); + + static Output BroadcastNodeLastDim(const ngraph::Output& value, + const ov::PartialShape& target_shape, + const ov::PartialShape& normalized_shape); }; } // namespace pass diff --git a/src/common/snippets/include/snippets/pass/loop_fusion.hpp b/src/common/snippets/include/snippets/pass/loop_fusion.hpp new file mode 100644 index 00000000000..14676a15a6e --- /dev/null +++ b/src/common/snippets/include/snippets/pass/loop_fusion.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface LoopFusion + * @brief Fuse Loops into one Loop if their semantics allow it + 
* @ingroup snippets + */ +class LoopFusion: public ngraph::pass::MatcherPass { +public: + LoopFusion(); + +private: + bool Merge(const std::shared_ptr& buffer); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/loop_helpers.hpp b/src/common/snippets/include/snippets/pass/loop_helpers.hpp new file mode 100644 index 00000000000..12e0e9746bc --- /dev/null +++ b/src/common/snippets/include/snippets/pass/loop_helpers.hpp @@ -0,0 +1,99 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" +#include "ngraph/op/parameter.hpp" +#include "snippets/op/loop.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/* ==== LoopBegin === */ +/** + * @interface insertLoopBeginAfterOutputs + * @brief Inserts LoopBegin operation after the group of operations described + * by the input argument (OutputVector). Use insertLoopBegin instead - it has a more universal interface. + * @ingroup snippets + */ +std::shared_ptr insertLoopBeginAfterOutputs(const OutputVector& originalOutputs); + +/** + * @interface insertLoopBegin + * @brief Inserts LoopBegin operation after the group of operations described + * by the input argument (ParameterVector, NodeVector or OutputVector). + * @ingroup snippets + */ +template +std::shared_ptr insertLoopBegin(const T& afterTheseNodes) { + static_assert(std::is_same() || std::is_same(), + "Unsupported template parameter for insertLoopBegin. 
Only ParameterVector or NodeVector is allowed"); + OutputVector originalOutputs; + std::vector>> childInputs; + for (const auto &n : afterTheseNodes) { + const auto& nodeOutputs = n->outputs(); + // Ignore the LoopBegin->LoopEnd edge to make it easier to construct enclosed Loops + std::move(nodeOutputs.begin(), nodeOutputs.end() - 1 * ov::is_type(n), std::back_inserter(originalOutputs)); + } + + return insertLoopBeginAfterOutputs(originalOutputs); +} + +template<> +inline std::shared_ptr insertLoopBegin(const OutputVector& afterTheseNodes) { + return insertLoopBeginAfterOutputs(afterTheseNodes); +} +/* ============== */ + +/* ==== LoopEnd === */ +/** + * @interface insertLoopEndBeforeInputs + * @brief Inserts LoopEnd operation before the group of operations described + * by the input argument (vector of inputs). Use insertLoopEnd instead - it has a more universal interface. + * @param originalInputs LoopEnd will be inserted before these inputs + * @param loopBegin pointer to the beginning of the Loop region + * @param work_amount total number of evaluations to be processed by the loop + * @param increment number of evaluations processed in one iteration of the loop + * @param apply_increment describes which data pointers attributed to the loop should be incremented on every iteration. + * should be used when Loop is connected to Parameters and/or Results + * @param finalization_offsets pointer shifts that should be applied to data pointers before exiting the loop + * @ingroup snippets + */ + +std::shared_ptr insertLoopEndBeforeInputs(const std::vector>& originalInputs, + const std::shared_ptr& loopBegin, + size_t work_amount, size_t increment, + std::vector apply_increment = {}, + std::vector finalization_offsets = {}); + +/** + * @interface insertLoopEnd + * @brief Inserts LoopEnd operation before the group of operations described + * by the input argument (ResultVector, NodeVector or OutputVector).
+ * @ingroup snippets + */ +template +std::shared_ptr insertLoopEnd(const T& beforeTheseNodes, Args ...args) { + static_assert(std::is_same() || std::is_same(), + "Unsupported template parameter for insertLoopBegin. Only ParameterVector or NodeVector is allowed"); + std::vector> originalInputs; + for (const auto &n : beforeTheseNodes) { + const auto& nodeInputs = n->inputs(); + // Ignore the LoopBegin->LoopEnd edge to facilitate enclosed Loops construction + std::move(nodeInputs.begin(), nodeInputs.end() - 1 * ov::is_type(n), std::back_inserter(originalInputs)); + } + return insertLoopEndBeforeInputs(originalInputs, args...); +} + +template +std::shared_ptr insertLoopEnd(const std::vector>& beforeTheseNodes, Args ...args) { + return insertLoopEndBeforeInputs(beforeTheseNodes, args...); +} +/* ============== */ + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/pass/matmul_to_brgemm.hpp b/src/common/snippets/include/snippets/pass/matmul_to_brgemm.hpp new file mode 100644 index 00000000000..1f00b944b56 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/matmul_to_brgemm.hpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/pass/graph_rewrite.hpp" +#include "ngraph/pattern/matcher.hpp" + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface MatMulToBrgemm + * @brief Replaces ngraph::MatMul with snippets::op::Brgemm operation (only non-transposing MatMuls are currently supported) + * @ingroup snippets + */ +class MatMulToBrgemm: public ngraph::pass::MatcherPass { +public: + OPENVINO_RTTI("MatMulToBrgemm", "0"); + MatMulToBrgemm(); +}; + + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/mha_tokenization.hpp b/src/common/snippets/include/snippets/pass/mha_tokenization.hpp
new file mode 100644 index 00000000000..7c161e8447e --- /dev/null +++ b/src/common/snippets/include/snippets/pass/mha_tokenization.hpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface TokenizeMHASnippets + * @brief The pass tokenizes MHA-pattern into Subgraph + * TODO: Write pattern + * @ingroup snippets + */ +class TokenizeMHASnippets: public ngraph::pass::MatcherPass { +public: + OPENVINO_RTTI("TokenizeMHASnippets", "0"); + TokenizeMHASnippets(); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/reset_buffer.hpp b/src/common/snippets/include/snippets/pass/reset_buffer.hpp new file mode 100644 index 00000000000..599b533e3eb --- /dev/null +++ b/src/common/snippets/include/snippets/pass/reset_buffer.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface ResetBufferState + * @brief If there is Buffer between loops we should reset Buffer pointer after first loop execution (data storing) using finalization offsets + * to have correct buffer data pointer for data loading in the next loop where data was stored in previous loop + * @ingroup snippets + */ +class ResetBufferState: public ngraph::pass::MatcherPass { +public: + ResetBufferState(); + + static int64_t calculate_required_finalization_offsets(const size_t inner_master_work_amount, const size_t inner_target_work_amount); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp b/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp new file mode 100644 index 
00000000000..b640ab35b0b --- /dev/null +++ b/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp @@ -0,0 +1,30 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface SoftmaxDecomposition + * @brief The pass decomposes Softmax into explicit Snippets dialects + * Note: + * - At the moment Snippets supports Softmax only in MHA pattern where there are Buffer ops before and after Softmax. + * Also Snippets support Loops with Buffer ops on inputs and outputs if Buffer have the same buffer byte size + * because of work with ptr increment. So we have to set Tile rank as buffer allocation rank even if rank 1 is enough + * @ingroup snippets + */ +class SoftmaxDecomposition: public ngraph::pass::MatcherPass { +public: + SoftmaxDecomposition(const size_t vector_size, const int32_t buffer_allocation_rank = -1); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/softmax_reshape_elimination.hpp b/src/common/snippets/include/snippets/pass/softmax_reshape_elimination.hpp new file mode 100644 index 00000000000..7522f411669 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/softmax_reshape_elimination.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface SoftmaxReshapeElimination + * @brief The pass removes Reshape operations around Softmax if possible + * @ingroup snippets + */ +class SoftmaxReshapeElimination: public ngraph::pass::MatcherPass { +public: + SoftmaxReshapeElimination(); +}; + + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/tokenization.hpp
b/src/common/snippets/include/snippets/pass/tokenization.hpp new file mode 100644 index 00000000000..19b776ec257 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/tokenization.hpp @@ -0,0 +1,58 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "snippets/pass/mha_tokenization.hpp" +#include "snippets/pass/collapse_subgraph.hpp" + +namespace ngraph { +namespace snippets { +namespace pass { + +/* + NotSet - default value returned by GetSnippetsNodeType(...) if the node wasn't marked + SkippedByPlugin - indicate that snippets can't include this node in subgraph. Can be set by Plugin via SetSnippetsNodeType(...). + */ +enum class SnippetsNodeType : int64_t {NotSet, SkippedByPlugin}; +void SetSnippetsNodeType(const std::shared_ptr&, SnippetsNodeType); +SnippetsNodeType GetSnippetsNodeType(const std::shared_ptr&); +void SetTopologicalOrder(const std::shared_ptr&, int64_t); +int64_t GetTopologicalOrder(const std::shared_ptr&); + +/** + * @interface EnumerateNodes + * @brief Snippets rely on topological order to avoid creating cyclic dependencies. This transformation sets the order. + * @ingroup snippets + */ +class EnumerateNodes : public ov::pass::ModelPass { +public: + OPENVINO_RTTI("EnumerateNodes", "0"); + EnumerateNodes() : ModelPass() {} + bool run_on_model(const std::shared_ptr&) override; +}; + + +/** + * @interface SnippetsTokenization + * @brief Splits model to supported subgraphs + * 1. Enumerate nodes by topological order + * 2. MHA tokenization + * 3. Common tokenization + * 4. Some common transformations for Subgraphs. 
For example, FakeQuantize decomposition + * @ingroup snippets + */ +class SnippetsTokenization : public ngraph::pass::FunctionPass { +public: + OPENVINO_RTTI("SnippetsTokenization", "0"); + bool run_on_model(const std::shared_ptr& m) override; +}; + + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/transpose_decomposition.hpp b/src/common/snippets/include/snippets/pass/transpose_decomposition.hpp new file mode 100644 index 00000000000..9f939eea4b7 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/transpose_decomposition.hpp @@ -0,0 +1,28 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface TransposeDecomposition + * @brief Decompose Transpose to Load + Store wrapped in several loops. + * @ingroup snippets + */ +class TransposeDecomposition: public ngraph::pass::MatcherPass { +public: + OPENVINO_RTTI("TransposeDecomposition", "0"); + TransposeDecomposition(); + static const std::set> supported_cases; +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/snippets_isa.hpp b/src/common/snippets/include/snippets/snippets_isa.hpp index d3f8957e2fc..af489925c51 100644 --- a/src/common/snippets/include/snippets/snippets_isa.hpp +++ b/src/common/snippets/include/snippets/snippets_isa.hpp @@ -9,16 +9,21 @@ #include "op/broadcastload.hpp" #include "op/broadcastmove.hpp" +#include "op/buffer.hpp" #include "op/convert_saturation.hpp" #include "op/convert_truncation.hpp" +#include "op/horizon_max.hpp" +#include "op/horizon_sum.hpp" +#include "op/fill.hpp" #include "op/kernel.hpp" #include "op/load.hpp" #include "op/nop.hpp" #include "op/scalar.hpp" #include "op/powerstatic.hpp" #include "op/store.hpp" -#include "op/tile.hpp" -#include "op/tile_scheduler.hpp" +#include 
"op/loop.hpp" +#include "op/brgemm.hpp" +#include "op/vector_buffer.hpp" namespace ngraph { namespace snippets { diff --git a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp index 09380faf4e2..1816322bb36 100644 --- a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp +++ b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp @@ -11,6 +11,10 @@ // SnippetS dialect NGRAPH_OP(Load, ngraph::snippets::op) +NGRAPH_OP(LoadReshape, ngraph::snippets::op) +NGRAPH_OP(LoopBegin, ngraph::snippets::op) +NGRAPH_OP(LoopEnd, ngraph::snippets::op) +NGRAPH_OP(Brgemm, ngraph::snippets::op) NGRAPH_OP(BroadcastLoad, ngraph::snippets::op) NGRAPH_OP(Store, ngraph::snippets::op) diff --git a/src/common/snippets/include/snippets/utils.hpp b/src/common/snippets/include/snippets/utils.hpp index 97447ddd648..253785b516d 100644 --- a/src/common/snippets/include/snippets/utils.hpp +++ b/src/common/snippets/include/snippets/utils.hpp @@ -11,6 +11,7 @@ #include "snippets_isa.hpp" #include "emitter.hpp" + namespace ngraph { namespace snippets { namespace utils { @@ -23,6 +24,15 @@ inline auto is_scalar_constant(const std::shared_ptr& source_outpu return ngraph::is_type(source_output_node) && ngraph::shape_size(source_output_node->get_shape()) == 1; } + +ov::PartialShape get_port_planar_shape(const Output& out); +ov::PartialShape get_reordered_planar_shape(const ov::PartialShape& shape, const std::vector& layout); +std::vector get_node_output_layout(const std::shared_ptr& node); +std::vector get_node_output_layout(const Node* node); + +inline ov::Dimension get_inner_dim(const ov::PartialShape &shape) { return *(shape.rbegin()); } +inline ov::Dimension get_outer_dim(const ov::PartialShape &shape) { return *(shape.rbegin() + 1); } + } // namespace utils } // namespace snippets } // namespace ngraph diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 6ff07d977ae..c305db67f01 
100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -6,106 +6,219 @@ #include "snippets/pass/assign_registers.hpp" #include "snippets/pass/vector_to_scalar.hpp" #include "snippets/pass/insert_load_store.hpp" -#include "snippets/op/tile.hpp" +#include "snippets/op/loop.hpp" +#include "snippets/op/subgraph.hpp" #include "snippets/op/kernel.hpp" #include #include +#include -auto ngraph::snippets::getRegisters(std::shared_ptr& n) -> ngraph::snippets::RegInfo { +namespace ngraph { +namespace snippets { + +auto getRegisters(const std::shared_ptr &n) -> RegInfo { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::getRegisters") - auto rt = n->get_rt_info(); // ToDo: change to reg_t std::vector rin, rout; - auto it_rt = rt.find("reginfo"); - if (it_rt != rt.end()) { - for (auto reg : it_rt->second.as>()) { - rout.push_back(reg); - } + for (const auto& output : n->outputs()) { + const auto& rt = output.get_tensor_ptr()->get_rt_info(); + auto it_rt = rt.find("reginfo"); + if (it_rt != rt.end()) + rout.push_back(it_rt->second.as()); } for (const auto& input : n->inputs()) { - auto rt = input.get_source_output().get_node_shared_ptr()->get_rt_info(); + auto rt = input.get_source_output().get_tensor_ptr()->get_rt_info(); auto it_rt = rt.find("reginfo"); - if (it_rt != rt.end()) { - for (auto& reg : it_rt->second.as>()) { - rin.push_back(reg); - } - } + if (it_rt != rt.end()) + rin.push_back(it_rt->second.as()); } + return std::make_pair(rin, rout); } +auto tail_transformations(NodeVector& tail, const size_t tail_size, const ngraph::snippets::Generator::GeneratorConfig& config) -> void { + NodeVector updated_tile; + auto insertFill = [tail_size](const ov::Input& input) -> std::shared_ptr { + auto copyRegInfo = [](const ov::descriptor::Tensor& from, ov::descriptor::Tensor& to) -> void { + auto rt = from.get_rt_info(); + auto reginfo = rt.find("reginfo"); + if (reginfo != rt.end()) { + 
to.get_rt_info()["reginfo"] = reginfo->second; + } + }; + std::shared_ptr fill = nullptr; + auto& rt = input.get_rt_info(); + auto fill_rt = rt.find("set_fill"); + if (fill_rt != rt.end()) { + const auto fill_value = fill_rt->second.as(); + fill = std::make_shared(input.get_source_output(), tail_size, fill_value); + input.get_node()->set_argument(input.get_index(), fill); + // we should explicitly copy reg info because we insert Fill after assign register + copyRegInfo(fill->get_input_tensor(0), fill->get_output_tensor(0)); + } + return fill; + }; + + for (auto& op : tail) { + // We should fill vector regs by float_min and zero to have + // correct math calculations for ReduceMax and ReduceSum in scalar case. + // Note: We find Maximum and Add ops because HorizonMax and HorizonSum are outside Loop, + // so they are missed in + if (config.m_need_fill_tail_register && + (ov::is_type(op) || + ov::is_type(op))) { + for (auto i = 0; i < op->inputs().size(); ++i) { + if (auto fill = insertFill(op->input(i))) { + updated_tile.push_back(fill); + } + } + } else if (const auto memory_access = std::dynamic_pointer_cast(op)) { + if (memory_access->get_count() != 1) { + memory_access->set_count(tail_size); + } + } + updated_tile.push_back(op); + } + + tail = std::move(updated_tile); +} + ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr& m, - const void* compile_params) const { + const GeneratorConfig& config, + const void* compile_params) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::Generator::generate") if (!target->is_supported()) - throw ngraph_error("unsupported architecture for code genration"); - - auto params = m->get_parameters(); - auto results = m->get_results(); - auto in = params.size(); - auto out = results.size(); - std::vector io_last_dims(in + out); - std::vector io_data_sizes(in + out); - std::transform(params.begin(), params.end(), io_last_dims.begin(), - [](const std::shared_ptr& n){return 
n->get_output_shape(0).back();}); - std::transform(results.begin(), results.end(), io_last_dims.begin() + in, - [](const std::shared_ptr& n){return n->get_input_shape(0).back();}); - std::transform(params.begin(), params.end(), io_data_sizes.begin(), - [](const std::shared_ptr& n){return n->get_element_type().size();}); - std::transform(results.begin(), results.end(), io_data_sizes.begin() + in, - [](const std::shared_ptr& n){return n->get_element_type().size();}); + throw ngraph_error("unsupported architecture for code generation"); OV_ITT_TASK_CHAIN(GENERATE, ngraph::pass::itt::domains::SnippetsTransform, "Snippets::Generator", "::VectorTile") - // vector tile + // vector loop std::vector lowered; - for (auto n : m->get_ordered_ops()) { - lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n))); - } - OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile") + auto lower_ops = [&lowered, this](const NodeVector& ops){ + std::transform(ops.begin(), ops.end(), std::back_inserter(lowered), + [this](const std::shared_ptr& n){ + return std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)); + }); + }; + // *1* solo vector/tail loop + empty outer loop + // => skip increments (both counter & ptr) : set evaluate_once flag + // *2* solo vector/tail loop + non-empty outer loop + // => skip counter increments but perform ptr increments : set evaluate_once, + // and perform pointer increments through finalization offsets + // *3* vector loop(s) + one tail loop + // => vector as usual, tail depends on outer loop, see *1* and *2* + auto optimize_single_evaluation = [](const std::shared_ptr& loop, bool force_ptr_increment = false) { + if (loop->get_work_amount() < 2 * loop->get_increment()) { + loop->set_evaluate_once(true); + if (force_ptr_increment || loop->has_outer_loop) { + std::vector new_finalization_offsets(loop->get_finalization_offsets()); + const auto& ptr_increments = loop->get_ptr_increments(); + for 
(auto i = 0; i < new_finalization_offsets.size(); i++) { + new_finalization_offsets[i] += ptr_increments[i]; + } + loop->set_finalization_offsets(new_finalization_offsets); + } + return true; + } else { + return false; + } + }; + const auto& ops = m->get_ordered_ops(); + for (auto op = ops.begin(); op < ops.end(); op++) { + const auto& loop_begin = ov::as_type_ptr(*op); - // scalar tile - auto m_scalar = ov::clone_model(*m.get()); - ngraph::pass::Manager mng; - mng.register_pass(); - mng.register_pass(); - mng.run_passes(m_scalar); - OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile_get") - std::vector scalar_lowered; - for (auto n : m_scalar->get_ordered_ops()) { - scalar_lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n))); - } - OV_ITT_TASK_NEXT(GENERATE, "::Tiles1D"); - // wrapping into tiles1D - //todo: in, out, and io_last_dims should derive naturally from the graph representation - const auto& vector_tile = std::make_shared(lowered, target->get_lanes(), in, out, io_last_dims, io_data_sizes); - const auto& vector_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(vector_tile), - std::make_pair(std::vector{}, std::vector{})); - const auto& scalar_tile = std::make_shared(scalar_lowered, 1, in, out, io_last_dims, io_data_sizes); - const auto& scalar_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(scalar_tile), - std::make_pair(std::vector{}, std::vector{})); + // ignore outer loops and possible manual scalar loops + if (loop_begin && loop_begin->get_increment() != 1) { + OV_ITT_TASK_NEXT(GENERATE, "::VectorLoop") + NodeVector vector_loop, tail_loop; + std::shared_ptr vector_loop_end, tail_loop_end; + vector_loop_end = loop_begin->get_loop_end(); + tail_loop_end = nullptr; + while (*op != vector_loop_end) + vector_loop.push_back(*op++); + vector_loop.push_back(*op); + const auto work_amount = vector_loop_end->get_work_amount(); + const auto 
increment = vector_loop_end->get_increment(); + const auto tail_size = work_amount % increment; + const auto need_tail = tail_size != 0; + const auto need_vector_loop = work_amount >= increment; + // Note, that finalization_offsets could be modified inside optimize_single_evaluation, + // so need to save them here to cover (evaluate_once vector with non-zero finalization_offsets + tail) + std::vector tail_finalization_offsets = need_tail ? vector_loop_end->get_finalization_offsets() : std::vector {}; + // vector loops are required => Just copy the body, original loop is already a vector one + if (need_vector_loop) { + // Note that finalization offsets should be applied after the last iteration. + // So if there is a tail, then we should apply offsets after it, but not now. + if (need_tail) + vector_loop_end->set_finalization_offsets(std::vector(tail_finalization_offsets.size(), 0)); - OV_ITT_TASK_NEXT(GENERATE, "::Tiles2D") - // wrapping into tiles2D - auto tile_scheduler = std::make_shared(vector_region, scalar_region); - tile_scheduler->compile_params = compile_params; - const auto& tile_scheduler_region = std::make_pair(target->get(ngraph::snippets::op::TileScheduler::get_type_info_static())(tile_scheduler), - std::make_pair(std::vector({in, out, target->get_lanes()}), std::vector{})); + if (config.m_optimize_single_evaluation) { + // force ptr increments if there is tail + optimize_single_evaluation(vector_loop_end, need_tail); + } + + lower_ops(vector_loop); + } + OV_ITT_TASK_NEXT(GENERATE, "::TailLoop") + // tail is required => transform the body into a tail representation + // tail loop is fake loop because for tail we should calculate only + // finalization offsets which are supported by LoopEnd. 
+ if (need_tail) { + NodeMap vector_to_tail_node_map; + tail_loop = ngraph::clone_nodes(vector_loop, vector_to_tail_node_map); + tail_transformations(tail_loop, tail_size, config); + tail_loop_end = ov::as_type_ptr(*tail_loop.rbegin()); + tail_loop_end->set_finalization_offsets(tail_finalization_offsets); + tail_loop_end->set_increment(tail_size); + // ptr increments were set to the old increment, need to update them in accordance with the new one + tail_loop_end->update_ptr_increments(static_cast(tail_size)); + tail_loop_end->set_work_amount(tail_size); + tail_loop_end->has_outer_loop = vector_loop_end->has_outer_loop; + + if (config.m_optimize_single_evaluation) { + // tail loop is always executed once + optimize_single_evaluation(tail_loop_end); + } + + lower_ops(tail_loop); + } + } else { + lower_ops({*op}); + } + } OV_ITT_TASK_NEXT(GENERATE, "::EmitCode") - // emission - auto tiles2DKernel = std::make_shared(std::vector {tile_scheduler_region}); - tiles2DKernel->compile_params = compile_params; - std::shared_ptr kernel = target->get(ngraph::snippets::op::Kernel::get_type_info_static())(tiles2DKernel); - kernel->emit_code({in, out}, {}); + //todo: Kernel need info on i/o data access pattern and data shapes to calculate data offsets + // pass Params and Results + // todo: it's probably better to move AllocaledEmitter creation inside Kernel constructor + // So Kernel accepts only model ptr and target, and creates AllocatedEmitter inside + //emission + auto loops2DKernel = std::make_shared(lowered, m); + loops2DKernel->compile_params = compile_params; + std::shared_ptr kernel = target->get(op::Kernel::get_type_info_static())(loops2DKernel); + + kernel->emit_code({}, {}); + OV_ITT_TASK_NEXT(GENERATE, "::EmitData") - lowered.insert(lowered.end(), scalar_lowered.begin(), scalar_lowered.end()); for (auto& op : lowered) { op.first->emit_data(); } OV_ITT_TASK_NEXT(GENERATE, "::GetSnippet") + + // todo: we save lowered to access compiled brgemm kernels on execution time 
(normally lowered is destructed by then) + // remove this when kernel caching is implemented. Don't forget to make generate const method. + if (config.m_save_lowered_code) + lowered_saved = lowered; + return target->get_snippet(); } + +std::shared_ptr Generator::get_target_machine() const { + return target; +} + +}// namespace snippets +}// namespace ngraph diff --git a/src/common/snippets/src/op/brgemm.cpp b/src/common/snippets/src/op/brgemm.cpp new file mode 100644 index 00000000000..7bf999cb15e --- /dev/null +++ b/src/common/snippets/src/op/brgemm.cpp @@ -0,0 +1,64 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/itt.hpp" +#include "snippets/op/brgemm.hpp" +#include "ngraph/runtime/host_tensor.hpp" +#include "openvino/core/rt_info.hpp" +#include "snippets/utils.hpp" +#include "matmul_shape_inference.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +Brgemm::Brgemm(const Output& A, const Output& B, const size_t offset_a, const size_t offset_b, const size_t offset_c) + : MatMul(), m_offset_a(offset_a), m_offset_b(offset_b), m_offset_c(offset_c) { + set_arguments({A, B}); + set_output_size(1); + constructor_validate_and_infer_types(); +} + +bool Brgemm::visit_attributes(AttributeVisitor& visitor) { + MatMul::visit_attributes(visitor); + visitor.on_attribute("offset_a", m_offset_a); + visitor.on_attribute("offset_b", m_offset_b); + visitor.on_attribute("offset_c", m_offset_c); + return true; +} + +void Brgemm::validate_and_infer_types() { + INTERNAL_OP_SCOPE(Brgemm_validate_and_infer_types); + element::Type result_et; + NODE_VALIDATION_CHECK(this, + element::Type::merge(result_et, get_input_element_type(0), get_input_element_type(1)), + "Arguments do not have the same element type (arg0 element type: ", + get_input_element_type(0), + ", arg1 element type: ", + get_input_element_type(1), + ")."); + // If no leading dimensions are provided, assume dense row-major inputs-outputs + 
NODE_VALIDATION_CHECK(this, get_input_partial_shape(0).is_static() && get_input_partial_shape(1).is_static(), + "Brgemm currently supports only static shapes."); + + std::vector planar_input_shapes; + for (const auto& in : input_values()) + planar_input_shapes.emplace_back(utils::get_port_planar_shape(in)); + + std::vector output_shapes = {ov::PartialShape{}}; + ov::op::v0::shape_infer(this, planar_input_shapes, output_shapes); + const auto& output_layout = utils::get_node_output_layout(this); + output_shapes[0] = utils::get_reordered_planar_shape(output_shapes[0], output_layout); + set_output_type(0, result_et, output_shapes[0]); +} + +std::shared_ptr Brgemm::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Brgemm_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), new_args.at(1), m_offset_a, m_offset_b, m_offset_c); +} + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/src/op/broadcastload.cpp b/src/common/snippets/src/op/broadcastload.cpp index 7d8dd32cafb..0f4e6c7667e 100644 --- a/src/common/snippets/src/op/broadcastload.cpp +++ b/src/common/snippets/src/op/broadcastload.cpp @@ -11,15 +11,21 @@ using namespace std; using namespace ngraph; -snippets::op::BroadcastLoad::BroadcastLoad(const Output& x, Shape shape) -: BroadcastMove(x, shape) { +snippets::op::BroadcastLoad::BroadcastLoad(const Output& x, ov::PartialShape shape, size_t offset) + : BroadcastMove(x, std::move(shape)), m_offset(offset) { constructor_validate_and_infer_types(); } +bool snippets::op::BroadcastLoad::visit_attributes(AttributeVisitor& visitor) { + BroadcastMove::visit_attributes(visitor); + visitor.on_attribute("offset", m_offset); + return true; +} + std::shared_ptr snippets::op::BroadcastLoad::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(BroadcastLoad); check_new_args_count(this, new_args); - return 
std::make_shared(new_args.at(0), output_shape); + return std::make_shared(new_args.at(0), output_shape, m_offset); } void snippets::op::BroadcastLoad::validate_and_infer_types() { diff --git a/src/common/snippets/src/op/broadcastmove.cpp b/src/common/snippets/src/op/broadcastmove.cpp index 1a0d300ff5c..17910d3c642 100644 --- a/src/common/snippets/src/op/broadcastmove.cpp +++ b/src/common/snippets/src/op/broadcastmove.cpp @@ -12,7 +12,7 @@ using namespace std; using namespace ngraph; -snippets::op::BroadcastMove::BroadcastMove(const Output& x, Shape shape) : Op({x}), output_shape(shape) { +snippets::op::BroadcastMove::BroadcastMove(const Output& x, ov::PartialShape shape) : Op({x}), output_shape(std::move(shape)) { constructor_validate_and_infer_types(); } @@ -24,44 +24,9 @@ bool snippets::op::BroadcastMove::visit_attributes(AttributeVisitor& visitor) { std::shared_ptr snippets::op::BroadcastMove::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(BroadcastMove); check_new_args_count(this, new_args); - auto other = std::make_shared(new_args.at(0), this->output_shape); - return other; + return std::make_shared(new_args.at(0), output_shape); } void snippets::op::BroadcastMove::validate_and_infer_types() { set_output_type(0, get_input_element_type(0), this->output_shape); -} - -bool snippets::op::BroadcastMove::evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const { - INTERNAL_OP_SCOPE(BroadcastMove); - NGRAPH_CHECK(input_values.size() == this->inputs().size(), "wrong input config"); - NGRAPH_CHECK(output_values.size() == this->outputs().size(), "wrong output config"); - NGRAPH_CHECK(input_values.size() == output_values.size() && input_values.size() == 1, "must be 1->1 operation"); - NGRAPH_CHECK(this->output(0).get_shape() == output_values[0]->get_shape(), "output vector must have the same shape as output port"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output 
must have same shape"); - - auto ishape = input_values[0]->get_shape(); - auto oshape = output_values[0]->get_shape(); - - NGRAPH_CHECK(ishape.size() == oshape.size(), "input and output should have the same rank"); - - AxisSet broadcast_axes; - for (size_t k = 0; k < ishape.size(); k++) { - if (!((ishape[k] == oshape[k]) - || (ishape[k] != oshape[k] && ((ishape[k] == 1) != (oshape[k] == 1) ) ))) { - throw ngraph_error("FakeBroadcast::evaluate incompatible shapes"); - } - - if (ishape[k] != oshape[k]) { - broadcast_axes.insert(k); - } - } - - runtime::reference::broadcast(input_values[0]->get_data_ptr(), - output_values[0]->get_data_ptr(), - input_values[0]->get_shape(), - output_values[0]->get_shape(), - broadcast_axes, - sizeof(float)); - return true; -} +} \ No newline at end of file diff --git a/src/common/snippets/src/op/buffer.cpp b/src/common/snippets/src/op/buffer.cpp new file mode 100644 index 00000000000..ad05ae2e046 --- /dev/null +++ b/src/common/snippets/src/op/buffer.cpp @@ -0,0 +1,53 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/buffer.hpp" +#include "snippets/snippets_isa.hpp" + +#include + +using namespace std; +using namespace ngraph; + +auto normalize_rank(int32_t allocation_rank, const size_t shape_rank) -> int32_t { + return allocation_rank < 0 ? 
allocation_rank + static_cast(shape_rank) : allocation_rank; +} + +snippets::op::Buffer::Buffer(const Output& x, const int32_t allocation_rank) : Op({x}), m_allocation_rank(allocation_rank) { + constructor_validate_and_infer_types(); +} + +bool snippets::op::Buffer::visit_attributes(AttributeVisitor& visitor) { + INTERNAL_OP_SCOPE(Buffer_visit_attributes); + visitor.on_attribute("allocation_rank", m_allocation_rank); + return true; +} + +std::shared_ptr snippets::op::Buffer::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Buffer_clone_with_new_inputs); + check_new_args_count(this, new_args); + auto new_buffer = std::make_shared(new_args.at(0), m_allocation_rank); + return new_buffer; +} + +void snippets::op::Buffer::validate_and_infer_types() { + INTERNAL_OP_SCOPE(Buffer_validate_and_infer_types); + const auto shape_rank = get_input_partial_shape(0).rank(); + if (shape_rank.is_static()) { + const auto normalized_rank = normalize_rank(m_allocation_rank, shape_rank.get_length()); + NGRAPH_CHECK(normalized_rank >= 0 && normalized_rank <= shape_rank.get_length(), + "Buffer has incorrect allocation rank: " + std::to_string(m_allocation_rank)); + } + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + +size_t ngraph::snippets::op::Buffer::get_byte_size() const { + const auto pshape = get_input_partial_shape(0); + NGRAPH_CHECK(pshape.is_static(), "Buffer should have static shapes for memory allocation"); + const auto shape = pshape.get_shape(); + const auto normalized_rank = normalize_rank(m_allocation_rank, shape.size()); + return ngraph::shape_size(shape.rbegin(), shape.rbegin() + normalized_rank) * get_element_type().size(); +} diff --git a/src/common/snippets/src/op/fill.cpp b/src/common/snippets/src/op/fill.cpp new file mode 100644 index 00000000000..ac93a501aad --- /dev/null +++ b/src/common/snippets/src/op/fill.cpp @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: 
Apache-2.0 +// + +#include + +#include "snippets/op/fill.hpp" + +#include + +using namespace std; +using namespace ngraph; + +snippets::op::Fill::Fill(const Output& x, const size_t offset, const uint32_t fill_value) + : Op({x}), m_offset(offset), m_fill_value(fill_value) { + constructor_validate_and_infer_types(); +} + +bool snippets::op::Fill::visit_attributes(AttributeVisitor& visitor) { + INTERNAL_OP_SCOPE(Fill_visit_attributes); + visitor.on_attribute("offset", m_offset); + visitor.on_attribute("fill_value", m_fill_value); + return true; +} + +std::shared_ptr snippets::op::Fill::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Fill_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_offset, m_fill_value); +} + +void snippets::op::Fill::validate_and_infer_types() { + INTERNAL_OP_SCOPE(Fill_validate_and_infer_types); + const auto in_type = get_input_element_type(0); + NGRAPH_CHECK(in_type.size() == 4, "Fill operation supports only element types with 4 byte size but got:" + std::to_string(in_type.size())); + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + diff --git a/src/common/snippets/src/op/horizon_max.cpp b/src/common/snippets/src/op/horizon_max.cpp new file mode 100644 index 00000000000..37e6e3f3c55 --- /dev/null +++ b/src/common/snippets/src/op/horizon_max.cpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/op/horizon_max.hpp" + +using namespace std; +using namespace ngraph; + +snippets::op::HorizonMax::HorizonMax(const Output& x) : Op({x}) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::HorizonMax::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(HorizonMax_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0)); +} + +void 
snippets::op::HorizonMax::validate_and_infer_types() { + INTERNAL_OP_SCOPE(HorizonMax_validate_and_infer_types); + auto new_shape = get_input_partial_shape(0); + if (!ov::is_scalar(new_shape)) { + new_shape[new_shape.size() - 1] = 1lu; + } + set_output_type(0, get_input_element_type(0), new_shape); +} diff --git a/src/common/snippets/src/op/horizon_sum.cpp b/src/common/snippets/src/op/horizon_sum.cpp new file mode 100644 index 00000000000..fa791dec234 --- /dev/null +++ b/src/common/snippets/src/op/horizon_sum.cpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/op/horizon_sum.hpp" + +using namespace std; +using namespace ngraph; + +snippets::op::HorizonSum::HorizonSum(const Output& x) : Op({x}) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::HorizonSum::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(HorizonSum_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0)); +} + +void snippets::op::HorizonSum::validate_and_infer_types() { + INTERNAL_OP_SCOPE(HorizonSum_validate_and_infer_types); + auto new_shape = get_input_partial_shape(0); + if (!ov::is_scalar(new_shape)) { + new_shape[new_shape.size() - 1] = 1lu; + } + set_output_type(0, get_input_element_type(0), new_shape); +} diff --git a/src/common/snippets/src/op/kernel.cpp b/src/common/snippets/src/op/kernel.cpp index aebca7edd3a..7003d3ba28c 100644 --- a/src/common/snippets/src/op/kernel.cpp +++ b/src/common/snippets/src/op/kernel.cpp @@ -5,8 +5,14 @@ #include "snippets/op/kernel.hpp" #include "snippets/generator.hpp" -using namespace std; -using namespace ngraph; +namespace ngraph { +namespace snippets { +namespace op { -snippets::op::Kernel::Kernel(const std::vector, snippets::RegInfo>>& nested) : Op(), region(nested) { +Kernel::Kernel(std::vector nested, std::shared_ptr m) +: Op(), region(std::move(nested)), 
model(std::move(m)) { } + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/op/load.cpp b/src/common/snippets/src/op/load.cpp index b49d7696fb8..8ee227c7afb 100644 --- a/src/common/snippets/src/op/load.cpp +++ b/src/common/snippets/src/op/load.cpp @@ -8,39 +8,54 @@ #include -using namespace std; -using namespace ngraph; +namespace ngraph { +namespace snippets { +namespace op { -snippets::op::Load::Load(const Output& x, const size_t count) : Op({x}), m_count(count) { +Load::Load(const Output& x, const size_t count, const size_t offset) : MemoryAccess({x}, count, offset) { constructor_validate_and_infer_types(); } -bool snippets::op::Load::visit_attributes(AttributeVisitor& visitor) { - return true; -} - -std::shared_ptr snippets::op::Load::clone_with_new_inputs(const OutputVector& new_args) const { +std::shared_ptr Load::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Load); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_count); + return std::make_shared(new_args.at(0), m_count, m_offset); } -void snippets::op::Load::validate_and_infer_types() { - set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); + +LoadReshape::LoadReshape(const Output& x, const size_t count, const size_t offset, std::vector order) + : Load(x, count, offset), m_order(std::move(order)) { + const auto& in_shape = x.get_partial_shape(); + NGRAPH_CHECK(in_shape.is_static(), "LoadReshape supports only static input shapes"); + const auto in_shape_size = in_shape.size(); + NGRAPH_CHECK(m_order.size() == in_shape_size, "LoadReshape got new_order of invalid size"); + NGRAPH_CHECK(*std::max_element(m_order.begin(), m_order.end()) == in_shape_size - 1 && + *std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReshape detected invalid values in new_order"); + const std::set unique_dims(order.begin(), order.end()); + 
NGRAPH_CHECK(unique_dims.size() == order.size(), "LoadReshape order must not contain repeated elements"); + constructor_validate_and_infer_types(); } -bool snippets::op::Load::evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const { - INTERNAL_OP_SCOPE(Load); - NGRAPH_CHECK(input_values.size() == this->inputs().size(), "wrong input config"); - NGRAPH_CHECK(output_values.size() == this->outputs().size(), "wrong output config"); - NGRAPH_CHECK(input_values.size() == output_values.size() && input_values.size() == 1, "must be 1->1 operation"); - NGRAPH_CHECK(this->output(0).get_shape() == output_values[0]->get_shape(), "output vector must have the same shape as output port"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - - std::copy(input_values[0]->get_data_ptr(), - input_values[0]->get_data_ptr() + shape_size(get_output_shape(0))*output_values[0]->get_element_type().size(), - output_values[0]->get_data_ptr()); +void snippets::op::LoadReshape::validate_and_infer_types() { + const auto& old_shape = get_input_partial_shape(0); + ov::PartialShape new_shape; + for (const auto idx : m_order) + new_shape.push_back(old_shape[idx]); + set_output_type(0, get_input_element_type(0), new_shape); +} +bool snippets::op::LoadReshape::visit_attributes(AttributeVisitor& visitor) { + Load::visit_attributes(visitor); + visitor.on_attribute("order", m_order); return true; } + +std::shared_ptr snippets::op::LoadReshape::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(LoadReshape); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_count, m_offset, m_order); +} + +}// namespace op +}// namespace snippets +}// namespace ngraph diff --git a/src/common/snippets/src/op/loop.cpp 
b/src/common/snippets/src/op/loop.cpp new file mode 100644 index 00000000000..e1a4de9fef8 --- /dev/null +++ b/src/common/snippets/src/op/loop.cpp @@ -0,0 +1,182 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/op/loop.hpp" +#include "snippets/generator.hpp" + +using namespace std; +namespace ngraph { +namespace snippets { +namespace op { + +LoopBase::LoopBase(const std::vector> &args, size_t work_amount, size_t increment) + : Op(args), work_amount(work_amount), work_amount_increment(increment), evaluate_once(false) { +} + +bool LoopBase::visit_attributes(AttributeVisitor &visitor) { + visitor.on_attribute("work_amount", work_amount); + visitor.on_attribute("increment", work_amount_increment); + return true; +} + +size_t LoopBase::get_work_amount() const { + return work_amount; +} + +bool LoopBase::get_evaluate_once() const { + return evaluate_once; +} + +size_t LoopBase::get_increment() const { + return work_amount_increment; +} + +LoopBegin::LoopBegin(const std::vector> &args, size_t work_amount, size_t work_amount_increment) + : LoopBase(args, work_amount, work_amount_increment), + begin_address(nullptr), input_regs({}) { + // We can only call a reduced validate_and_infer types from the constructor, since LoopEnd might not be attached + // to the LoopBegin at this point (which is usually the case: create LoopBegin first => then attach LoopEnd to it) + validate_and_infer_types_except_LoopEnd(); +} + +LoopBegin::LoopBegin(const std::vector> &args) + : LoopBase(args, 0, 0), begin_address(nullptr), input_regs({}) { + validate_and_infer_types_except_LoopEnd(); +} + +std::shared_ptr LoopBegin::clone_with_new_inputs(const OutputVector& inputs) const { + return std::shared_ptr(new LoopBegin(inputs, work_amount, work_amount_increment)); +} + + +void LoopBegin::validate_and_infer_types_except_LoopEnd() { + const size_t num_inputs = get_input_size(); + set_output_size(num_inputs + 1); + // All outputs are 
by-passed from inputs, except for the last one - it connects LoopBegin and LoopEnd + for (int i = 0; i < num_inputs; i++) + get_output_descriptor(i).set_tensor_ptr(get_input_descriptor(i).get_output().get_tensor_ptr()); + set_output_type(num_inputs, element::f32, ov::PartialShape{ov::Shape{}}); +} + +void LoopBegin::validate_and_infer_types() { + validate_and_infer_types_except_LoopEnd(); + const auto& last_output_inputs = output(get_output_size() - 1).get_target_inputs(); + NODE_VALIDATION_CHECK(this, last_output_inputs.size() == 1, "LoopBegin must have exactly one input attached to the last output"); + const auto& loop_end = ov::as_type_ptr(last_output_inputs.begin()->get_node()->shared_from_this()); + NODE_VALIDATION_CHECK(this, loop_end != nullptr, "LoopBegin must have LoopEnd connected to its last output"); + work_amount = loop_end->get_work_amount(); + work_amount_increment = loop_end->get_increment(); +} + +std::shared_ptr LoopBegin::get_loop_end() { + const auto& last_output_inputs = output(get_output_size() - 1).get_target_inputs(); + if (last_output_inputs.size() != 1) + throw std::invalid_argument("LoopBegin has more than one inputs attached to the last output"); + const auto& loop_end = ov::as_type_ptr(last_output_inputs.begin()->get_node()->shared_from_this()); + if (!loop_end) + throw std::invalid_argument("LoopBegin last output is not connected to LoopEnd"); + return loop_end; +} + +LoopEnd::LoopEnd(const std::vector> &args, size_t work_amount, size_t work_amount_increment, + std::vector apply_increments, std::vector finalization_offsets) + : LoopBase(args, work_amount, work_amount_increment), finalization_offsets(std::move(finalization_offsets)), + has_outer_loop(true), loop_io_size(0) { + ptr_increments.resize(apply_increments.size()); + std::transform(apply_increments.begin(), apply_increments.end(), ptr_increments.begin(), + [work_amount_increment](bool apply) { + return apply ? 
work_amount_increment : 0; + }); + constructor_validate_and_infer_types(); +} + +LoopEnd::LoopEnd(const std::vector> &args, size_t work_amount, size_t work_amount_increment, + std::vector ptr_increments, std::vector finalization_offsets) + : LoopBase(args, work_amount, work_amount_increment), ptr_increments(std::move(ptr_increments)), + finalization_offsets(std::move(finalization_offsets)), has_outer_loop(true), loop_io_size(0) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr LoopEnd::clone_with_new_inputs(const OutputVector& inputs) const { + return std::make_shared(inputs, work_amount, work_amount_increment, ptr_increments, finalization_offsets); +} + +std::shared_ptr LoopEnd::get_loop_begin() { + const auto& loop_begin = ov::as_type_ptr(get_input_source_output(get_input_size() - 1).get_node_shared_ptr()); + if (!loop_begin) + throw std::invalid_argument("LoopEnd last input is not connected to LoopBegin"); + return loop_begin; +} + +const std::vector& LoopEnd::get_finalization_offsets() const { + return finalization_offsets; +} + +const std::vector& LoopEnd::get_ptr_increments()const { + return ptr_increments; +} + +void LoopEnd::set_finalization_offsets(std::vector offsets) { + if (offsets.size() != loop_io_size) + throw std::invalid_argument("LoopEnd set_finalization_offsets is called with inconsistent offsets.size()"); + finalization_offsets = std::move(offsets); +} + +void LoopEnd::set_ptr_increments(std::vector new_ptr_increments) { + if (new_ptr_increments.size() != loop_io_size) + throw std::invalid_argument("LoopEnd set_ptr_increments is called with inconsistent new_ptr_increments.size()"); + ptr_increments = std::move(new_ptr_increments); +} + +void LoopEnd::update_ptr_increments(int64_t new_increment) { + std::transform(ptr_increments.begin(), ptr_increments.end(), ptr_increments.begin(), + [new_increment](int64_t old_increment){ + return old_increment != 0 ? 
new_increment : 0; + }); +} + +void LoopEnd::set_work_amount(size_t new_work_amount) { + work_amount = new_work_amount; + // Update LoopBegin to maintain consistency between the Loops + get_loop_begin()->work_amount = new_work_amount; +} + +void LoopEnd::set_increment(size_t new_increment) { + work_amount_increment = new_increment; + // Update LoopBegin to maintain consistency between the Loops + get_loop_begin()->work_amount_increment = new_increment; +} + +void LoopEnd::set_evaluate_once(bool once) { + evaluate_once = once; + // Update LoopBegin to maintain consistency between the Loops + get_loop_begin()->evaluate_once = once; +} + +void LoopEnd::validate_and_infer_types() { + const size_t num_inputs = get_input_size(); + const auto loop_begin = ov::as_type_ptr(input(get_input_size() - 1).get_source_output().get_node_shared_ptr()); + NODE_VALIDATION_CHECK(this, loop_begin != nullptr, "LoopEnd must have LoopBegin as the last argument"); + // Note: have to -2 because the LoopBegin->LoopEnd edge is counted twice + loop_io_size = get_input_size() + loop_begin->get_output_size() - 2; + NODE_VALIDATION_CHECK(this, ptr_increments.empty() || ptr_increments.size() == loop_io_size, + "ptr_increments must be either empty or defined per every input & output of joined Loop. Expected size: ", + loop_io_size, " got ", ptr_increments.size()); + NODE_VALIDATION_CHECK(this, finalization_offsets.empty() || finalization_offsets.size() == loop_io_size, + "finalization_offsets must be either empty or defined per every input & output of joined Loop. 
Expected size: ", + loop_io_size, " got ", finalization_offsets.size()); + if (ptr_increments.empty()) + ptr_increments.resize(loop_io_size, static_cast(work_amount_increment)); + if (finalization_offsets.empty()) + finalization_offsets.resize(loop_io_size, 0); + set_output_size(num_inputs - 1); + const auto& ins = inputs(); + // All outputs are by-passed from inputs, except for the last one - it connects LoopBegin and LoopEnd + for (int i = 0; i < num_inputs - 1; i++) + get_output_descriptor(i).set_tensor_ptr(get_input_descriptor(i).get_output().get_tensor_ptr()); +} + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/op/memory_access.cpp b/src/common/snippets/src/op/memory_access.cpp new file mode 100644 index 00000000000..2530ea77b63 --- /dev/null +++ b/src/common/snippets/src/op/memory_access.cpp @@ -0,0 +1,45 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/memory_access.hpp" + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +MemoryAccess::MemoryAccess(const Output& x, const size_t count, const size_t offset) : Op({x}), m_count(count), m_offset(offset) {} + +bool MemoryAccess::visit_attributes(AttributeVisitor& visitor) { + visitor.on_attribute("count", m_count); + visitor.on_attribute("offset", m_offset); + return true; +} + +size_t MemoryAccess::get_count() const { + return m_count; +} + +size_t MemoryAccess::get_offset() const { + return m_offset; +} + +void MemoryAccess::set_count(const size_t count) { + m_count = count; +} + +void MemoryAccess::set_offset(const size_t offset) { + m_offset = offset; +} + +void MemoryAccess::validate_and_infer_types() { + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git 
a/src/common/snippets/src/op/powerstatic.cpp b/src/common/snippets/src/op/powerstatic.cpp deleted file mode 100644 index cc23b40ac01..00000000000 --- a/src/common/snippets/src/op/powerstatic.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/op/powerstatic.hpp" - -namespace ngraph { -namespace snippets { -namespace op { - -BWDCMP_RTTI_DEFINITION(PowerStatic); - -} // namespace op -} // namespace snippets -} // namespace ngraph diff --git a/src/common/snippets/src/op/scalar.cpp b/src/common/snippets/src/op/scalar.cpp index 16fa33f2f3a..d89ed94b235 100644 --- a/src/common/snippets/src/op/scalar.cpp +++ b/src/common/snippets/src/op/scalar.cpp @@ -6,8 +6,6 @@ using namespace ngraph; -BWDCMP_RTTI_DEFINITION(snippets::op::Scalar); - std::shared_ptr snippets::op::Scalar::clone_with_new_inputs(const OutputVector& new_args) const { check_new_args_count(this, new_args); return std::make_shared(*this); @@ -22,3 +20,13 @@ void snippets::op::Scalar::validate_and_infer_types() { "Scalar supports only one-element constants, got ", out_pshape.get_shape(), " shape"); } + +bool snippets::op::Scalar::visit_attributes(AttributeVisitor& visitor) { + auto shape = get_output_shape(0); + auto type = get_output_element_type(0); + auto value = cast_vector(); + visitor.on_attribute("element_type", type); + visitor.on_attribute("shape", shape); + visitor.on_attribute("value", value); + return true; +} diff --git a/src/common/snippets/src/op/store.cpp b/src/common/snippets/src/op/store.cpp index d75101be0c8..2cee1b20751 100644 --- a/src/common/snippets/src/op/store.cpp +++ b/src/common/snippets/src/op/store.cpp @@ -8,39 +8,19 @@ #include -using namespace std; -using namespace ngraph; +namespace ngraph { +namespace snippets { +namespace op { -snippets::op::Store::Store(const Output& x, const size_t count) : Op({x}), m_count(count) { +snippets::op::Store::Store(const Output& x, const size_t count, 
const size_t offset) : MemoryAccess({x}, count, offset) { constructor_validate_and_infer_types(); } - -bool snippets::op::Store::visit_attributes(AttributeVisitor& visitor) { - return true; -} - std::shared_ptr snippets::op::Store::clone_with_new_inputs(const OutputVector& new_args) const { - INTERNAL_OP_SCOPE(Store); + INTERNAL_OP_SCOPE(Store_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_count); + return std::make_shared(new_args.at(0), m_count, m_offset); } -void snippets::op::Store::validate_and_infer_types() { - set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); -} - -bool snippets::op::Store::evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const { - INTERNAL_OP_SCOPE(Store); - NGRAPH_CHECK(input_values.size() == this->inputs().size(), "wrong input config"); - NGRAPH_CHECK(output_values.size() == this->outputs().size(), "wrong output config"); - NGRAPH_CHECK(input_values.size() == output_values.size() && input_values.size() == 1, "must be 1->1 operation"); - NGRAPH_CHECK(this->output(0).get_shape() == output_values[0]->get_shape(), "output vector must have the same shape as output port"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - - std::copy(input_values[0]->get_data_ptr(), - input_values[0]->get_data_ptr() + shape_size(get_output_shape(0))*output_values[0]->get_element_type().size(), - output_values[0]->get_data_ptr()); - - return true; -} +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 4d8fc5ad100..7cfd6a46605 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -9,13 +9,22 @@ 
#include "snippets/op/convert_saturation.hpp" #include "snippets/pass/insert_load_store.hpp" #include "snippets/pass/insert_movebroadcast.hpp" +#include "snippets/pass/broadcast_to_movebroadcast.hpp" #include "snippets/pass/load_movebroadcast_to_broadcastload.hpp" #include "snippets/pass/assign_registers.hpp" #include "snippets/pass/convert_constants.hpp" #include "snippets/pass/convert_power_to_powerstatic.hpp" #include "snippets/pass/vector_to_scalar.hpp" +#include "snippets/pass/insert_loops.hpp" +#include "snippets/pass/transpose_decomposition.hpp" #include "snippets/pass/transform_convert.hpp" #include "snippets/pass/align_element_type.hpp" +#include "snippets/pass/matmul_to_brgemm.hpp" +#include "snippets/pass/fuse_transpose_brgemm.hpp" +#include "snippets/pass/softmax_decomposition.hpp" +#include "snippets/pass/reset_buffer.hpp" +#include "snippets/pass/insert_buffer.hpp" +#include "snippets/pass/loop_fusion.hpp" #include "snippets/utils.hpp" #include "transformations/common_optimizations/nop_elimination.hpp" @@ -34,27 +43,43 @@ using namespace std; using namespace ngraph; using namespace ov::op::util; -BWDCMP_RTTI_DEFINITION(snippets::op::Subgraph); - void snippets::op::Subgraph::set_generator(std::shared_ptr generator) { m_generator = generator; } -void snippets::op::Subgraph::set_non_scalar_constants_count(const size_t count) { - m_non_scalar_constants_count = count; +void snippets::op::Subgraph::set_virtual_port_count(const size_t count) { + m_virtual_port_count = count; +} + +void snippets::op::Subgraph::set_buffer_needed(const bool need) { + m_buffer_needed = need; +} + +void snippets::op::Subgraph::init_config() { + const auto ops = body_ptr()->get_ops(); + for (const auto& op : ops) { + config.m_is_quantized = config.m_is_quantized || + ov::is_type(op); + config.m_has_type_relaxed_ops = config.m_has_type_relaxed_ops || + std::dynamic_pointer_cast(op); + config.m_is_needed_to_align_precision = config.m_is_needed_to_align_precision || + is_quantized() 
|| + has_type_relaxed_ops() || + snippets::pass::AlignElementType::opNeedsAlignElementType(op, execution_element_type); + config.m_has_domain_sensitive_ops = config.m_has_domain_sensitive_ops || + ov::is_type(op) || + ov::is_type(op) || + ov::is_type(op) || + ov::is_type(op); + } + // Domain sensitive ops are decomposed with explicit Loops. So, we should explicitly insert Loops in Subgraph if it contains these ops + config.m_explicit_loop_insertion = config.m_has_domain_sensitive_ops; } snippets::op::Subgraph::Subgraph(const OutputVector& args, std::shared_ptr body) - : SubGraphOp(args) { + : SubGraphOp(args), m_generator(nullptr) { set_function(body); - const auto ops = body_ptr()->get_ops(); - for (const auto& op : ops) { - config.m_is_quantized = config.m_is_quantized || ov::is_type(op); - config.m_has_type_relaxed_ops = config.m_has_type_relaxed_ops || std::dynamic_pointer_cast(op); - config.m_is_needed_to_align_precision = config.m_is_needed_to_align_precision || is_quantized() || has_type_relaxed_ops() || - snippets::pass::AlignElementType::opNeedsAlignElementType(op, execution_element_type); - } - + init_config(); constructor_validate_and_infer_types(); for (size_t i = 0; i < body->get_parameters().size(); ++i) m_input_descriptions[0].push_back(std::make_shared(i, i)); @@ -64,13 +89,43 @@ snippets::op::Subgraph::Subgraph(const OutputVector& args, std::shared_ptr body) - : Subgraph(as_output_vector(args), body) {} + : Subgraph(as_output_vector(args), std::move(body)) {} std::shared_ptr snippets::op::Subgraph::clone_with_new_inputs(const OutputVector& inputs) const { INTERNAL_OP_SCOPE(Subgraph); return make_shared(inputs, ov::clone_model(body())); } +std::vector snippets::op::Subgraph::reshape_body(const std::vector& input_shapes) { + auto& params = body_ptr()->get_parameters(); + OPENVINO_ASSERT(params.size() == input_shapes.size(), "Got invalid number of input shapes to reshape subgraph body"); + for (size_t i = 0; i < params.size(); ++i) { + 
params[i]->set_partial_shape(input_shapes[i]); + } + body_ptr()->validate_nodes_and_infer_types(); + std::vector output_shapes; + for (const auto& res : body_ptr()->get_results()) { + output_shapes.emplace_back(res->get_input_partial_shape(0)); + } + return output_shapes; +} + +std::vector snippets::op::Subgraph::reshape_body(const std::vector& input_shapes) { + auto& params = body_ptr()->get_parameters(); + OPENVINO_ASSERT(params.size() == input_shapes.size(), "Got invalid number of input shapes to reshape subgraph body"); + for (size_t i = 0; i < params.size(); ++i) { + params[i]->set_partial_shape(input_shapes[i]); + } + body_ptr()->validate_nodes_and_infer_types(); + std::vector output_shapes; + for (const auto& res : body_ptr()->get_results()) { + auto pshape = res->get_input_partial_shape(0); + OPENVINO_ASSERT(pshape.is_static(), "Subgraph inferred dynamic output shape during reshape with static inputs"); + output_shapes.emplace_back(res->get_input_partial_shape(0).get_shape()); + } + return output_shapes; +} + void snippets::op::Subgraph::validate_and_infer_types() { INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::validate_and_infer_types") @@ -111,8 +166,10 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptrinput_values()) { - if ((utils::is_scalar_constant(input.get_node_shared_ptr())) || - (ov::is_type(node) && ov::is_type(input.get_node_shared_ptr()))) { + if (ov::is_type(input.get_node_shared_ptr()) && + (ngraph::shape_size(input.get_shape()) == 1 || + ov::is_type(node) || + constant_input_should_be_inside_body(node))) { body_inputs.push_back(input); } else { auto parameter = std::make_shared(input.get_element_type(), input.get_partial_shape()); @@ -142,9 +199,17 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptrget_friendly_name(), body_results, body_parameters); auto subgraph = build_subgraph(node, subgraph_inputs, body); + bool need_buffer = 
false; + size_t hidden_data_count = 0lu; if (auto fq_node = ov::as_type_ptr(node)) { - subgraph->set_non_scalar_constants_count(utils::get_non_scalar_constant_count_for_fq(fq_node)); + hidden_data_count += utils::get_non_scalar_constant_count_for_fq(fq_node); + // Ops that requires Buffer + } else if (ov::is_type(node) || + ov::is_type(node)) { + need_buffer |= true; } + subgraph->set_virtual_port_count(hidden_data_count); + subgraph->set_buffer_needed(need_buffer); for (size_t i = 0; i < body->get_parameters().size(); i++) { body->get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name()); @@ -170,6 +235,13 @@ void snippets::op::Subgraph::fill_empty_output_names(const Output& target_ NGRAPH_SUPPRESS_DEPRECATED_END } +auto snippets::op::Subgraph::constant_input_should_be_inside_body(const std::shared_ptr& node) -> bool { + return ov::is_type(node) || + ov::is_type(node) || + ov::is_type(node) || + ov::is_type(node); +} + /// /// \brief Canonization transforms original subgraph and to canonical form suitable for code generation. In particular, /// it handles supported layout conversions, broadcasts inputs and outputs to a single rank and layout. Canonicalization @@ -178,7 +250,8 @@ void snippets::op::Subgraph::fill_empty_output_names(const Output& target_ /// * None: all inputs have the same layout /// * Planar + blocked: some inputs have blocked, and some have planar layouts, e.g. 
+ /// Also there is precision aligning inside body of subgraph during canonicalization -Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes) { +ov::PartialShape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, + const BlockedShapeVector& inputShapes) { INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::canonicalize") NODE_VALIDATION_CHECK(this, inputShapes.size() == body_ptr()->get_parameters().size(), @@ -193,30 +266,29 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape return std::get<0>(lhs).size() < std::get<0>(rhs).size(); }); }; - Shape baseShape; + PartialShape baseShape; AxisVector baseOrder; std::tie(baseShape, baseOrder, std::ignore) = getMaxRankBlockedShape(inputShapes); const auto baseRank = baseShape.size(); const bool baseIsBlocked = baseOrder.size() != std::set(baseOrder.begin(), baseOrder.end()).size(); for (size_t i = 0; i < inputShapes.size(); i++) { const auto &blockedShape = inputShapes[i]; - Shape inShape; + PartialShape inShape; AxisVector inOrder; element::Type inType; std::tie(inShape, inOrder, inType) = blockedShape; const auto inRank = inShape.size(); NODE_VALIDATION_CHECK(this, inRank <= baseRank, "Input rank can't be larger than output rank in snippets."); if (inRank < baseRank) { - Shape newShape(baseRank, 1); + PartialShape newShape(ov::Shape(baseRank, 1)); // todo: more complicated logics is needed if we want to merge smth else than blocked and planar - // could be done by PartialShape::broadcast_merge_into, but this way is faster - size_t startOffset = baseRank - inRank; if (baseIsBlocked) { const bool inIsNotBlocked = inOrder.size() == std::set(inOrder.begin(), inOrder.end()).size(); NODE_VALIDATION_CHECK(this, inIsNotBlocked, "Snippets don't support conversion between blocked layouts of different ranks"); - startOffset--; + 
inShape.insert(inShape.end(), ov::Dimension(1)); } - std::copy(inShape.begin(), inShape.end(), &newShape[startOffset]); + NODE_VALIDATION_CHECK(this, PartialShape::broadcast_merge_into(newShape, inShape, ov::op::AutoBroadcastType::NUMPY), + "Failed to broadcast_merge inputs in snippets canonicalization"); inShape = std::move(newShape); } else { // todo: 4d blocked + 5d planar layouts are not supported: + @@ -225,55 +297,66 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape "Snippets canonicalization got input shapes of equal ranks but different layouts, which is not supported"); } ov::PartialShape tmpPShape(baseShape); - NODE_VALIDATION_CHECK(this, - PartialShape::broadcast_merge_into(tmpPShape, inShape, ::ngraph::op::AutoBroadcastType::NUMPY), - "Failed to create broadcastable shapes in snippets canonicalization"); - const auto paramShape = body_ptr()->get_parameters()[i]->get_shape(); + // todo: we need to generalize canonicalization for domain-sensitive ops. E.g. 
MatMul inputs can't be broadcasted one to another + if (!config.m_has_domain_sensitive_ops) + NODE_VALIDATION_CHECK(this, + PartialShape::broadcast_merge_into(tmpPShape, inShape, ::ngraph::op::AutoBroadcastType::NUMPY), + "Failed to create broadcastable shapes in snippets canonicalization"); + const auto paramShape = body_ptr()->get_parameters()[i]->get_partial_shape(); const auto paramType = body_ptr()->get_parameters()[i]->get_element_type(); if (paramShape.size() != inShape.size() || !equal(paramShape.begin(), paramShape.end(), inShape.begin())) body_ptr()->replace_parameter(i, std::make_shared(paramType, inShape)); } - body_ptr()->validate_nodes_and_infer_types(); - auto skipStartEndOnes = [](const Shape& shape) { + auto skipStartEndOnes = [](const PartialShape& shape) { auto begin = shape.begin(); auto end = shape.end(); while (begin != end && *begin == 1) begin++; while (begin != end && *(end-1) == 1) end--; - Shape trimmedShape(end - begin, 1); + + PartialShape trimmedShape(std::vector (end - begin, 1)); std::copy(begin, end, trimmedShape.begin()); return trimmedShape; }; // Check that output shapes are broadcastable => can be scheduled const auto& body_results = body_ptr()->get_results(); - PartialShape outPShape = body_results[0]->get_shape(); - for (size_t i = 0; i < body_results.size(); i++) { - auto shape_i = body_results[i]->get_shape(); - auto outputShape_i = std::get<0>(outputShapes[i]); - // Check that the produced output shape corresponds to the passed shape - // Some produced shapes may have been changed to be broadcastable (e.g. 
blocked + planar outputs), - // so we need to remove leading and trailing "1" before the comparison - PartialShape pShape_i(skipStartEndOnes(shape_i)); - bool compatibleWithPassedShape = PartialShape::broadcast_merge_into(pShape_i, skipStartEndOnes(outputShape_i), - ::ngraph::op::AutoBroadcastType::NUMPY); - NODE_VALIDATION_CHECK(this, ov::shape_size(shape_i) == ov::shape_size(outputShape_i) && - compatibleWithPassedShape, "Inferred and passed results shapes are incompatible for snippet ", - get_friendly_name(), " : ", shape_i, " vs ", outputShape_i, "."); - // Check that output shapes are broadcastable to each other => can be scheduled - bool compatibleWithOtherOutputs = PartialShape::broadcast_merge_into(outPShape, shape_i, - ::ngraph::op::AutoBroadcastType::NUMPY); - NODE_VALIDATION_CHECK(this, compatibleWithOtherOutputs, "Snippets output shapes must be numpy broadcastable"); + PartialShape outPShape = body_results[0]->get_input_partial_shape(0); + // todo: we need a slightly more general approach for backward ROI propagation + const auto& result_parent = body_results[0]->get_input_node_shared_ptr(0); + if (body_results.size() == 1 && + ov::is_type(result_parent) && + ov::is_type(result_parent->get_input_node_shared_ptr(0))) { + outPShape = result_parent->get_input_partial_shape(0); + } else { + for (size_t i = 0; i < body_results.size(); i++) { + auto shape_i = body_results[i]->get_input_partial_shape(0); + auto outputShape_i = std::get<0>(outputShapes[i]); + // Check that the produced output shape corresponds to the passed shape + // Some produced shapes may have been changed to be broadcastable (e.g. 
blocked + planar outputs), + // so we need to remove leading and trailing "1" before the comparison + PartialShape pShape_i(skipStartEndOnes(shape_i)); + bool compatibleWithPassedShape = PartialShape::broadcast_merge_into(pShape_i, + skipStartEndOnes(outputShape_i), + ::ngraph::op::AutoBroadcastType::NUMPY); + NODE_VALIDATION_CHECK(this, compatibleWithPassedShape, + "Inferred and passed results shapes are incompatible for snippet "); + // Check that output shapes are broadcastable to each other => can be scheduled + bool compatibleWithOtherOutputs = PartialShape::broadcast_merge_into(outPShape, shape_i, + ::ngraph::op::AutoBroadcastType::NUMPY); + NODE_VALIDATION_CHECK(this, compatibleWithOtherOutputs, + "Snippets output shapes must be numpy broadcastable"); + } } // We should insert Converts after Parameters and Constant and before Results // to align precision inside Subgraph body that is supported by Plugin align_element_types(outputShapes, inputShapes); - exec_domain = outPShape.get_shape(); - return exec_domain; + master_shape = outPShape; + return master_shape; } void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes, @@ -303,55 +386,209 @@ void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outpu // - Insert Convert before operations that doesn't support original element type for execution // - Insert reverse Convert before operations that support original element type // but have inputs that doesn't support it (because before them will be inserted Convert with exec_type - first point) - // Then we should use ConstantFolding pass to convert element type of Scalars before inference. + // - Then we should use ConstantFolding pass to convert element type of Scalars before inference. 
+ // - Eliminate redundant Converts which can be inserted in AlignElementType() pass ngraph::pass::Manager manager; if (config.m_is_needed_to_align_precision) { manager.register_pass(execution_element_type); manager.register_pass(); + // TODO [100041] : In some cases AlignElementType pass can insert extra Convert because + // the pass doesn't know real precisions in real time. + // We call EliminateConverts pass to remove them + manager.register_pass(); } manager.run_passes(body_ptr()); } +void snippets::op::Subgraph::initialize_buffer_scratchpad_size() { + auto is_transpose_loop = [](const ov::Output& source_output) -> bool { + const auto parent = source_output.get_node_shared_ptr(); + // Transpose op is decomposed into LoopBegin->LoadReshape->Store->LoopEnd subgraph. LoadReshape op can be only + // in Transpose decomposition. So it's enough to verify that this Loop is Transpose pattern. + // We cannot check for non-equality of input and output shape of Transpose Loop because Transpose may have the same + // shapes on input and output. 
+ auto loop_end = ov::as_type_ptr(parent); + if (!loop_end) + return false; + size_t idx = source_output.get_index(); + while (ov::is_type(loop_end->get_input_node_shared_ptr(idx))) { + auto consumer = loop_end->input_value(idx); + idx = consumer.get_index(); + loop_end = ov::as_type_ptr(consumer.get_node_shared_ptr()); + } + + const auto loop_begin = loop_end->get_loop_begin(); + // At the moment Transpose Loops cannot be fused with other Loops, so check for one input and one output is enough + if (loop_begin->get_input_size() != 1 || loop_end->get_output_size() != 1 || loop_begin->get_output_target_inputs(0).size() != 1) + return false; + const auto consumer = loop_begin->get_output_target_inputs(0).begin()->get_node(); + return ov::is_type(consumer); + }; + auto propagate_offset = [](const std::shared_ptr& buffer, const size_t offset) { + // If Buffer has offset We set this offset in the next Load and Store ops + // to correctly read and write data because all buffers have the one register + // Also if user sets offset to a Buffer It means that the Buffer has the corresponding Load and Store ops + + // Propagate to up: in Store. 
Buffer can have only one Store + { + auto parent = buffer->get_input_node_shared_ptr(0); + auto idx = buffer->input(0).get_source_output().get_index(); + // There may be graph with several LoopBegin and LoopEnd between Store/Brgemm and Buffer, + // so we should iterate through LoopBase + while (ov::is_type(parent)) { + const auto source_output = parent->input_value(idx); + parent = source_output.get_node_shared_ptr(); + idx = source_output.get_index(); + } + if (auto store = ov::as_type_ptr(parent)) { + store->set_offset(offset); + } else if (const auto brgemm = ov::as_type_ptr(parent)) { + // Brgemm encapsulates work with loading and storing of data + brgemm->set_offset_c(offset); + } else { + throw ngraph_error("Buffer::set_offset() was called when Buffer didn't have the corresponding Store op for offset propagation"); + } + } + + // Propagate to down: in Load. Buffer can have several Load and Loops after himself. We should go through all target inputs + { + std::function&)> propagate_down; + propagate_down = [&](const Input& target_input) { + const auto child = target_input.get_node()->shared_from_this(); + // There may be graph with several LoopBegin and LoopEnd between Load/Brgemm and Buffer, + // so we should iterate through LoopBase + // Example: Softmax decomposition with ReduceMax + if (ov::is_type(child)) { + const auto index = target_input.get_index(); + for (const auto loop_target_output : child->output(index).get_target_inputs()) { + propagate_down(loop_target_output); + } + } else if (const auto load = ov::as_type_ptr(child)) { + load->set_offset(offset); + } else if (const auto brgemm = ov::as_type_ptr(child)) { + // Brgemm encapsulates work with loading and storing of data + if (target_input.get_index() == 0) { + brgemm->set_offset_a(offset); + } else if (target_input.get_index() == 1) { + brgemm->set_offset_b(offset); + } + } else { + throw ngraph_error("Buffer::set_offset() was called when Buffer didn't have the corresponding Load op for offset 
propagation"); + } + }; + + for (const auto target_output : buffer->output(0).get_target_inputs()) { + propagate_down(target_output); + } + } + }; + m_buffer_scratchpad = 0; + size_t offset = 0; + const auto ops = body_ptr()->get_ordered_ops(); + for (const auto& op : ops) { + if (const auto buffer = ov::as_type_ptr(op)) { + const auto buffer_size = buffer->get_byte_size(); + // We need to allocate memory for first buffer at least + if (m_buffer_scratchpad == 0) { + m_buffer_scratchpad += buffer_size; + continue; + } + + // Transpose and MatMul ops should have different memories on inputs and outputs to avoid data corruption, + // so after them, we should allocate new memory. Other operations (Eltwises, Convert) can be executed inplace. + const auto parent = buffer->get_input_node_shared_ptr(0); + if (ov::is_type(parent) || is_transpose_loop(parent)) { + offset = m_buffer_scratchpad; + propagate_offset(buffer, offset); + m_buffer_scratchpad += buffer_size; + continue; + } + + // If Buffer op requires memory size more that has been already allocated, + // we increase current memory size to the needed size + // For example, it's possible when we have a sequence of Eltwise ops with broadcasting + const auto current_allocated_memory_size = m_buffer_scratchpad - offset; + if (buffer_size > current_allocated_memory_size) { + m_buffer_scratchpad += (buffer_size - current_allocated_memory_size); + // Note: we don't update offset because we just add memory to needed size + } + + propagate_offset(buffer, offset); + } + } +} + void snippets::op::Subgraph::convert_to_snippet_dialect() { INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::convert_to_snippet_dialect") auto skip_matching_domain = [](const std::shared_ptr& n) -> bool { - return n->get_input_shape(0).back() != 1; + const auto& pshape = n->get_input_partial_shape(0); + const auto& last_dim = pshape[pshape.size() - 1]; + return last_dim.is_dynamic() || 
last_dim.get_length() != 1; }; // At the moment we support only full vector Load/Store and scalar Load/Store so that count is equal to lanes. // Then we are going to support variadic Load/Store with different element count const size_t count = m_generator->get_target_machine()->get_lanes(); + const auto & params = body_ptr()->get_parameters(); + bool inputs_has_dynamic_last_dims = std::any_of(params.begin(), params.end(), + [](const shared_ptr& p){ + return p->get_partial_shape().rbegin()->is_dynamic(); + }); + const auto allocationRank = static_cast(tileRank); ngraph::pass::Manager manager; + if (config.m_has_domain_sensitive_ops) { + manager.register_pass(); + manager.register_pass(); + manager.register_pass(allocationRank); + manager.register_pass(count, allocationRank); + manager.register_pass(); + } + manager.register_pass(); manager.register_pass(); manager.register_pass(); manager.register_pass(count); manager.register_pass(count); - manager.register_pass(); - manager.register_pass(); - // Note that, BrodacastMove is typically inserted right after the Load. Such cases are typical for - // simple subgraphs where one of the ngraph::op's inputs is broadcasted to match the larger one. However, BroadcastMove - // could also be inserted after the ngraph::op, if the op input don't need broadcasting, but the the output does - // (for example, to match the larger output of a child node). In such cases, Loads (and Stores) should be replaced - // with ScalarLoads (ScalarStores) to avoid invalid read in vector Tile. Graph example: - // Parameter_0 Parameter_1 Parameter_2 - // [1,2,5,16] [1,2,5,1] [1,2,5,1] - // Load BroadcastLoad Load* Scalar - // Add Subtract - // \___________ ___________BroadcastMove - // \ / - // Multiply - // Store - // Result - // Note: Load* should be replaced with ScalarLoad in this example to avoid invalid read in vector Tile. 
- if (!exec_domain.empty() && exec_domain.back() != 1) { - manager.register_pass(); - manager.register_pass(); - manager.get_pass_config()-> - set_callback(skip_matching_domain); - manager.get_pass_config()-> - set_callback(skip_matching_domain); + // todo: presently dynamic pipeline is activated even if the last two dimension are static + // In general, we can use static kernels in this case, but several parameters (src and dst memory pointers for example) + // should be passed as run-time args, so it's a mixed mode: kernel is shape-aware, but some additional runtime args are required + // Presently Broadcasting is organized in the following way: + // * ALL last dims are static => broadcasting is handled via MoveBroadcast and pointer arithmetics (even for dynamic upper dims) + if (!inputs_has_dynamic_last_dims) { + manager.register_pass(); + manager.register_pass(); + // Note that, BrodacastMove is typically inserted right after the Load. Such cases are typical for + // simple subgraphs where one of the ngraph::op's inputs is broadcasted to match the larger one. However, BroadcastMove + // could also be inserted after the ngraph::op, if the op input don't need broadcasting, but the output does + // (for example, to match the larger output of a child node). In such cases, Loads (and Stores) should be replaced + // with ScalarLoads (ScalarStores) to avoid invalid read in vector Loop. Graph example: + // Parameter_0 Parameter_1 Parameter_2 + // [1,2,5,16] [1,2,5,1] [1,2,5,1] + // Load BroadcastLoad Load* Scalar + // Add Subtract + // \___________ ___________BroadcastMove + // \ / + // Multiply + // Store + // Result + // Note: Load* should be replaced with ScalarLoad in this example to avoid invalid read in vector Loop. 
+ if (master_shape.size() != 0 && master_shape[master_shape.size() - 1] != 1) { + manager.register_pass(); + manager.register_pass(); + manager.get_pass_config()-> + set_callback(skip_matching_domain); + manager.get_pass_config()-> + set_callback(skip_matching_domain); + } + // Note that InsertLoops requires validate_and_infer_types afterwards, so add it manually if + // automatic validation will be disabled in the pass manager + manager.register_pass(master_shape, tileRank, + m_generator->get_target_machine()->get_lanes(), !config.m_explicit_loop_insertion); + if (config.m_has_domain_sensitive_ops) { + manager.register_pass(); + manager.register_pass(); + } } manager.run_passes(body_ptr()); } @@ -380,29 +617,29 @@ snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::generate") NGRAPH_CHECK(m_generator != nullptr, "generate is called while generator is not set"); + convert_to_snippet_dialect(); opt.run_passes(body_ptr()); - // generation flow + // After all passes, when all optimizations are completed and all MemoryAccess ops are inserted, + // we can calculate common buffer scratchpad size and propagate offset from Buffer to the corresponding MemoryAccess ops + if (config.m_has_domain_sensitive_ops) + initialize_buffer_scratchpad_size(); + snippets::pass::AssignRegisters().run_on_model(body_ptr()); - // schedule generation should go here and be target agnostic + const auto ops = body_ptr()->get_ops(); + ngraph::snippets::Generator::GeneratorConfig generatorConfig; + generatorConfig.m_save_lowered_code = config.m_has_domain_sensitive_ops; + generatorConfig.m_need_fill_tail_register = config.m_has_domain_sensitive_ops; + generatorConfig.m_optimize_single_evaluation = std::none_of(ops.begin(), ops.end(), [](const std::shared_ptr& op) { + return ov::is_type(op); + }); // actual code emission - ngraph::snippets::code ptr = 
m_generator->generate(body_ptr(), compile_params); + ngraph::snippets::code ptr = m_generator->generate(body_ptr(), generatorConfig, compile_params); - // check that body doesn't have constants for scheduling - std::vector> constants; - for (auto op : body_ptr()->get_ordered_ops()) { - if (auto constant = ov::as_type_ptr(op)) { - if (ngraph::shape_size(constant->get_shape()) != 1 && constant->get_shape() != Shape()) { - constants.push_back(constant); - } - } - } - NGRAPH_CHECK(!constants.size(), "External constants detected. Snippet is illigal for scheduling"); - - return {exec_domain, false /*canBeLinearized*/, ptr}; + return {master_shape, false /*canBeLinearized*/, ptr}; } void snippets::op::Subgraph::print() const { diff --git a/src/common/snippets/src/op/tile.cpp b/src/common/snippets/src/op/tile.cpp deleted file mode 100644 index 779df920600..00000000000 --- a/src/common/snippets/src/op/tile.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/op/tile.hpp" -#include "snippets/generator.hpp" - -using namespace std; -using namespace ngraph; - -snippets::op::Tile::Tile(const std::vector& region, size_t increment, - size_t num_inputs, size_t num_outputs, - const std::vector& io_dims, const std::vector& io_data_sizes) : - Op(), region(region), increment(increment), num_inputs(num_inputs), num_outputs(num_outputs), io_dims(io_dims), io_data_size(io_data_sizes) { -} diff --git a/src/common/snippets/src/op/tile_scheduler.cpp b/src/common/snippets/src/op/tile_scheduler.cpp deleted file mode 100644 index a613184dc62..00000000000 --- a/src/common/snippets/src/op/tile_scheduler.cpp +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/op/tile_scheduler.hpp" -#include "snippets/generator.hpp" - -ngraph::snippets::op::TileScheduler::TileScheduler(const AllocatedEmitter& vector_region, const 
AllocatedEmitter& scalar_region) - : Op(), vector_region{vector_region}, scalar_region{scalar_region} { -} diff --git a/src/common/snippets/src/op/vector_buffer.cpp b/src/common/snippets/src/op/vector_buffer.cpp new file mode 100644 index 00000000000..1be69a6d9ad --- /dev/null +++ b/src/common/snippets/src/op/vector_buffer.cpp @@ -0,0 +1,27 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/vector_buffer.hpp" + +#include + +using namespace std; +using namespace ngraph; + +snippets::op::VectorBuffer::VectorBuffer(const ov::element::Type element_type) : Op(), m_element_type(std::move(element_type)) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::VectorBuffer::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(VectorBuffer_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(m_element_type); +} + +void snippets::op::VectorBuffer::validate_and_infer_types() { + INTERNAL_OP_SCOPE(VectorBuffer_validate_and_infer_types); + set_output_type(0, m_element_type, Shape{1lu}); +} diff --git a/src/common/snippets/src/pass/align_element_type.cpp b/src/common/snippets/src/pass/align_element_type.cpp index 2ce3b031aca..469c82ffe22 100644 --- a/src/common/snippets/src/pass/align_element_type.cpp +++ b/src/common/snippets/src/pass/align_element_type.cpp @@ -20,13 +20,17 @@ inline auto is_in_op(const std::shared_ptr& n) -> bool { || ov::is_type(n); } -// At the moment Subgraph supports only Eltwise, Convert and FQ (which is decomposed into Eltwises and Convert) -// And only Eltwises supports execution only in "exec_type". 
So we can check op type from the opposite +// At the moment Subgraph supports only Eltwise, Select, Convert, Broadcast and FQ (which is decomposed into Eltwises and Convert) with +// Softmax (which is decomposed into Eltwises as well) +// And only Eltwise and Select ops supports execution only in "exec_type". So we can check op type from the opposite // NOTE: This check is only for executable which isn't Parameter/Constant/Result inline auto op_supports_only_exec_type(const std::shared_ptr& n) -> bool { return !is_in_op(n) && !ov::is_type(n) && - !ov::is_type(n); + !ov::is_type(n) && + !ov::is_type(n) && + !ov::is_type(n) && + !ov::is_type(n); } } // namespace @@ -58,7 +62,8 @@ bool ngraph::snippets::pass::AlignElementType::run_on_model(const std::shared_pt // - Input is Convert with unsupported destination type // - Input is Op which support any element type // We couldn't unite these conditions and just check that element type isn't supported exec type - // because we don't call validate_and_infer_types() so we don't know new precisions + // because we don't call validate_and_infer_types() so we don't know new precisions after setting of original + // input and output element types if ((existing_convert && existing_convert->get_destination_type() != exec_type) || (!op_supports_only_exec_type(shared_input))) { insertConvert(op, i, exec_type); @@ -89,6 +94,6 @@ bool ngraph::snippets::pass::AlignElementType::run_on_model(const std::shared_pt } bool ngraph::snippets::pass::AlignElementType::opNeedsAlignElementType(const std::shared_ptr& op, const ov::element::Type exec_type) { - // At the moment Snippets support only Eltwise/Convert/FQ which one output so we can just call get_element_type() + // At the moment Snippets support only Eltwise/Convert/FQ/Select/Softmax/Broadcast which one output so we can just call get_element_type() return op_supports_only_exec_type(op) && op->get_element_type() != exec_type; } diff --git 
a/src/common/snippets/src/pass/assign_registers.cpp b/src/common/snippets/src/pass/assign_registers.cpp index 836523ed727..bd864d65f22 100644 --- a/src/common/snippets/src/pass/assign_registers.cpp +++ b/src/common/snippets/src/pass/assign_registers.cpp @@ -2,81 +2,208 @@ // SPDX-License-Identifier: Apache-2.0 // -// #include #include -#include "snippets/remarks.hpp" - #include "snippets/pass/assign_registers.hpp" #include "snippets/snippets_isa.hpp" - -#include - #include +namespace { +static constexpr size_t reg_count = 16lu; +} // namespace + bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr& f) { RUN_ON_MODEL_SCOPE(AssignRegisters); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::AssignRegisters") using Reg = size_t; + using tensor = std::shared_ptr; auto ops = f->get_ordered_ops(); - decltype(ops) stmts; - std::copy_if(ops.begin(), ops.end(), std::back_inserter(stmts), [](decltype(ops[0]) op) { - return !(std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op)); - }); + // Note that currently there are 3 types of ops: + // * gpr->gpr: (Parameter, Result, LoopBegin, LoopEnd) will also be Buffer? + // * gpr->vec: or vec->gpr Load/LoadConvert, Store/StoreConvert, BroadcastLoad etc. + // * vec->vec: all other "normal" operations that perform calculations on vector registers: Add, BroadcastMove, Power, etc. 
+ enum op_reg_type {gpr2gpr, gpr2vec, vec2gpr, vec2vec}; - size_t rdx = 0; - std::map, Reg> regs; - for (const auto& op : stmts) { - for (const auto& output : op->outputs()) { - regs[output.get_tensor_ptr()] = rdx++; + auto get_op_reg_type = [](const std::shared_ptr& op) { + if (std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op)) + return gpr2gpr; + else if (std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op)) + return gpr2vec; + else if (std::dynamic_pointer_cast(op)) + return vec2gpr; + else + return vec2vec; + }; + std::vector>> typed_ops; + for (const auto& op : ops) + typed_ops.emplace_back(std::make_pair(get_op_reg_type(op), op)); + size_t counter_vec = 0; + size_t counter_gpr = 0; + std::map regs_vec, regs_gpr; + // Define a set of immune tensors that will be ignored by auto reg allocation => their reg allocation is done manually + std::map manually_assigned_gprs, manually_assigned_vecs; + const auto IS_MANUALLY_ALLOCATED_REG = SIZE_MAX; + const auto num_parameters = f->get_parameters().size(); + const auto num_results = f->get_results().size(); + auto accumulator_reg = 0lu; + for (const auto& op : ops) { + if (const auto& param = ov::as_type_ptr(op)) { + manually_assigned_gprs[op->output(0).get_tensor_ptr()] = + static_cast(f->get_parameter_index(param)); + } else if (const auto& result = ov::as_type_ptr(op)) { + // here we use the fact that Result input & output tensors are identical by construction + manually_assigned_gprs[op->output(0).get_tensor_ptr()] = + static_cast(f->get_result_index(result) + num_parameters); + } else if (const auto& buffer = ov::as_type_ptr(op)) { + // All buffers have one common data pointer + manually_assigned_gprs[op->input(0).get_tensor_ptr()] = + static_cast(num_results + num_parameters); + manually_assigned_gprs[op->output(0).get_tensor_ptr()] = + 
static_cast(num_results + num_parameters); + } else if (ov::is_type(op) || ov::is_type(op)) { + // Only in SoftmaxDecomposition ReduceMax and ReduceSum use HorizonMax/HorizonSum and VectorBuffer. + // We should manually set the one vector register for VectorBuffer and Max/Sum output to simulate a accumulator + // TODO [96351]: We should rewrite accumulator pattern using another way + const auto input = op->get_input_node_shared_ptr(0); // input - it's accumulator math op: Add or Max + for (size_t i = 0; i < input->get_input_size(); ++i) { + if (ov::is_type(input->get_input_node_shared_ptr(i))) { + manually_assigned_vecs[input->input(i).get_tensor_ptr()] = + static_cast(accumulator_reg); + } + } + + manually_assigned_vecs[input->output(0).get_tensor_ptr()] = + static_cast(accumulator_reg); + manually_assigned_vecs[op->output(0).get_tensor_ptr()] = + static_cast(accumulator_reg); + + // If there is Broadcast, it should have the same register as Horizon op + // because it's a result of the accumulator as well + for (auto& out : op->output(0).get_target_inputs()) { + const auto child = out.get_node()->shared_from_this(); + if (ov::is_type(child)) { + manually_assigned_vecs[child->output(0).get_tensor_ptr()] = + static_cast(accumulator_reg); + } + } + accumulator_reg++; } } - - std::vector> used; - std::vector> def; - - for (const auto& op : stmts) { - std::set u; - for (const auto& input : op->inputs()) { - if (regs.count(input.get_tensor_ptr())) { - u.insert(regs[input.get_tensor_ptr()]); + auto enumerate_out_tensors = [IS_MANUALLY_ALLOCATED_REG] (const std::shared_ptr& op, + decltype(regs_vec)& reg_map, + const std::map& manually_assigned_regs, + size_t& counter) { + for (const auto& output : op->outputs()) { + const auto& t = output.get_tensor_ptr(); + // Note that some ops might have identical input&output tensors (Result and Tile* for ex.) 
+ // so we have to check that the tensor has not been enumerated already + if (reg_map.count(t) == 0) { + reg_map[t] = manually_assigned_regs.count(t) == 0 ? counter++ : IS_MANUALLY_ALLOCATED_REG; } } - used.push_back(u); - - std::set d; - if (!std::dynamic_pointer_cast(op)) { - for (const auto& output : op->outputs()) { - d.insert(regs[output.get_tensor_ptr()]); - } + }; + for (const auto& t_op : typed_ops) { + switch (t_op.first) { + case vec2vec: + case gpr2vec: + enumerate_out_tensors(t_op.second, regs_vec, manually_assigned_vecs, counter_vec); + break; + case gpr2gpr: + case vec2gpr: + enumerate_out_tensors(t_op.second, regs_gpr, manually_assigned_gprs, counter_gpr); + break; + } + } + // todo: make one for gpr and one for vector + std::vector> used_gpr(ops.size(), std::set()); // used = used as an input + std::vector> defined_gpr(ops.size(), std::set()); // defined = used as output + std::vector> used_vec(ops.size(), std::set()); + std::vector> defined_vec(ops.size(), std::set()); + + auto tensor2reg = [IS_MANUALLY_ALLOCATED_REG] (const std::vector& tensors, const std::map& reg_map) { + std::set result; + for (const auto& t : tensors) { + if (reg_map.count(t) == 0) + throw ngraph::ngraph_error("Assign registers: attempt to access not enumerated tensor"); + Reg reg_id = reg_map.at(t); + if (reg_id != IS_MANUALLY_ALLOCATED_REG) + result.insert(reg_id); + } + return result; + }; + for (size_t i = 0; i < typed_ops.size(); i++) { + const auto& t_op = typed_ops[i]; + std::vector used_tensors, defined_tensors; + for (const auto& in : t_op.second->inputs()) + used_tensors.push_back(in.get_tensor_ptr()); + for (const auto& out : t_op.second->outputs()) + defined_tensors.push_back(out.get_tensor_ptr()); + switch (t_op.first) { + case vec2vec: + used_vec[i] = tensor2reg(used_tensors, regs_vec); + defined_vec[i] = tensor2reg(defined_tensors, regs_vec); + break; + case gpr2gpr: + used_gpr[i] = tensor2reg(used_tensors, regs_gpr); + defined_gpr[i] = 
tensor2reg(defined_tensors, regs_gpr); + break; + case gpr2vec: + used_gpr[i] = tensor2reg(used_tensors, regs_gpr); + defined_vec[i] = tensor2reg(defined_tensors, regs_vec); + break; + case vec2gpr: + used_vec[i] = tensor2reg(used_tensors, regs_vec); + defined_gpr[i] = tensor2reg(defined_tensors, regs_gpr); + break; } - def.push_back(d); } // define life intervals - std::vector> lifeIn(stmts.size(), std::set()); - std::vector> lifeOut(stmts.size(), std::set()); + // liveOut[i] - regs that are live on exit from i-th (topologically ordered) operation + // liveIn[i] - regs that are live on entering the i-th (topologically ordered) operation + std::vector> life_in_vec(std::move(used_vec)); + std::vector> life_out_vec(typed_ops.size(), std::set()); + std::vector> life_in_gpr(std::move(used_gpr)); + std::vector> life_out_gpr(typed_ops.size(), std::set()); - for (size_t i = 0; i < stmts.size(); i++) { - for (size_t n = 0; n < stmts.size(); n++) { - std::set_difference(lifeOut[n].begin(), lifeOut[n].end(), def[n].begin(), def[n].end(), std::inserter(lifeIn[n], lifeIn[n].begin())); - lifeIn[n].insert(used[n].begin(), used[n].end()); + // todo: this part if O(N*N), so it's slow for large subgraphs. Can we simplify it? 
At least add an early stopping criteria + for (size_t i = 0; i < typed_ops.size(); i++) { + for (size_t n = 0; n < typed_ops.size(); n++) { + // Regs that are live on entering the operation = regs used by the op + (all other regs alive - regs defined by the op) + // copy regs from lifeOut to lifeIn while ignoring regs in def + std::set_difference(life_out_gpr[n].begin(), life_out_gpr[n].end(), + defined_gpr[n].begin(), defined_gpr[n].end(), + std::inserter(life_in_gpr[n], life_in_gpr[n].begin())); + std::set_difference(life_out_vec[n].begin(), life_out_vec[n].end(), + defined_vec[n].begin(), defined_vec[n].end(), + std::inserter(life_in_vec[n], life_in_vec[n].begin())); } - for (size_t n = 0; n < stmts.size(); n++) { - auto node = stmts[n]; - if (!std::dynamic_pointer_cast(node)) { - for (const auto& out : node->outputs()) { - for (const auto& port : out.get_target_inputs()) { - auto pos = std::find(stmts.begin(), stmts.end(), port.get_node()->shared_from_this()); - if (pos != stmts.end()) { - auto k = pos-stmts.begin(); - lifeOut[n].insert(lifeIn[k].begin(), lifeIn[k].end()); - } + for (size_t n = 0; n < typed_ops.size(); n++) { + auto op = typed_ops[n].second; + for (const auto& out : op->outputs()) { + for (const auto& port : out.get_target_inputs()) { + auto k = std::find(ops.begin(), ops.end(), port.get_node()->shared_from_this()) - ops.begin(); + if (k == ops.size()) + throw ngraph_error("assign registers can't find target op in the body"); + switch (typed_ops[k].first) { + case vec2vec: + case vec2gpr: + life_out_vec[n].insert(life_in_vec[k].begin(), life_in_vec[k].end()); + break; + case gpr2gpr: + case gpr2vec: + life_out_gpr[n].insert(life_in_gpr[k].begin(), life_in_gpr[k].end()); + break; } } } } } - struct by_starting { auto operator()(const std::pair& lhs, const std::pair& rhs) const -> bool { return lhs.first < rhs.first|| (lhs.first == rhs.first && lhs.second < rhs.second); @@ -88,13 +215,15 @@ bool 
ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr return lhs.second < rhs.second || (lhs.second == rhs.second && lhs.first < rhs.first); } }; + // A variable live interval - is a range (start, stop) of op indexes, such that + // the variable is alive within this range (defined but not used by the last user) + std::map, Reg, by_starting> live_intervals_vec, live_intervals_gpr; - std::set, by_starting> live_intervals; - - std::reverse(lifeIn.begin(), lifeIn.end()); - auto find_last_use = [lifeIn](int i) -> int { - int ln = static_cast(lifeIn.size()) - 1; - for (auto& x : lifeIn) { + std::reverse(life_in_vec.begin(), life_in_vec.end()); + std::reverse(life_in_gpr.begin(), life_in_gpr.end()); + auto find_last_use = [](decltype(life_in_gpr) life_in, int i) -> int { + int ln = static_cast(life_in.size()) - 1; + for (auto& x : life_in) { if (x.find(i) != x.end()) { return ln; } @@ -102,67 +231,86 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr } return i; }; - - for (size_t i = 0; i < stmts.size(); i++) { - live_intervals.insert(std::make_pair(static_cast(i), find_last_use(static_cast(i)))); + for (int i = 0; i < static_cast(typed_ops.size()); i++) { + for (const auto& def : defined_vec[i]) + live_intervals_vec[std::make_pair(i, find_last_use(life_in_vec, static_cast(def)))] = def; + for (const auto& def : defined_gpr[i]) + live_intervals_gpr[std::make_pair(i, find_last_use(life_in_gpr, static_cast(def)))] = def; } - // http://web.cs.ucla.edu/~palsberg/course/cs132/linearscan.pdf - std::multiset, by_ending> active; - std::map register_map; - std::stack bank; - for (int i = 0; i < 16; i++) bank.push(16-1-i); + auto linescan_assign_registers = [](const decltype(live_intervals_vec)& live_intervals, + const std::set& reg_pool) { + // http://web.cs.ucla.edu/~palsberg/course/cs132/linearscan.pdf + // todo: do we need multimap? <=> can an op have two inputs from the same op? 
+ std::map, Reg, by_ending> active; + // uniquely defined register => reused reg (reduced subset enabled by reg by reusage) + std::map register_map; + std::stack bank; + // regs are stored in ascending order in reg_pool, so walk in reverse to assign them the same way + for (auto rit = reg_pool.crbegin(); rit != reg_pool.crend(); rit++) + bank.push(*rit); - for (auto interval : live_intervals) { - // check expired - while (!active.empty()) { - auto x = *active.begin(); - if (x.second >= interval.first) { - break; + std::pair interval, active_interval; + Reg unique_reg, active_unique_reg; + for (const auto& interval_reg : live_intervals) { + std::tie(interval, unique_reg) = interval_reg; + // check expired + while (!active.empty()) { + std::tie(active_interval, active_unique_reg) = *active.begin(); + // if end of active interval has not passed yet => stop removing actives since they are sorted by end + if (active_interval.second >= interval.first) { + break; + } + active.erase(active_interval); + bank.push(register_map[active_unique_reg]); } - active.erase(x); - bank.push(register_map[x.first]); - } - // allocate - if (active.size() == 16) { - throw ngraph_error("caanot allocate registers for a snippet "); - } else { - register_map[interval.first] = bank.top(); - bank.pop(); - active.insert(interval); - } - } - - std::map, Reg> physical_regs; - - for (const auto& reg : regs) { - physical_regs[reg.first] = register_map[reg.second]; - } - const auto num_parameters = f->get_parameters().size(); - for (const auto& n : f->get_ordered_ops()) { - auto& rt = n->get_rt_info(); - std::vector regs; - regs.reserve(n->outputs().size()); - /* The main idea here is that each operation stores its output regs in rt["reginfo"]. Input and output regs are - * then derived by parsing node's and parent's rt["reginfo"], look into ngraph::snippets::getRegisters for details. 
- * Note also that Parameter and Result store general-purpose register index, because they work with memory - * (memory pointer is stored in gpr). All other "regular" ops store vector regs indexes, since calculations are - * performed on registers. - */ - if (is_type(n)) { - continue; - } else if (const auto& param = ov::as_type_ptr(n)) { - regs.push_back(f->get_parameter_index(param)); - } else if (const auto& store = ov::as_type_ptr(n)) { - regs.push_back(f->get_result_index(store) + num_parameters); - } else { - for (const auto& output : n->outputs()) { - auto allocated = physical_regs[output.get_tensor_ptr()]; - regs.push_back(allocated); + // allocate + if (active.size() == reg_pool.size()) { + // todo: if it is LoopBegin or LoopEnd that requires gpr, and we don't have any in the pool, + // then assign SIZE_MAX-1 as a flag to spill a reg inside emitter + throw ngraph::ngraph_error("can't allocate registers for a snippet "); + } else { + register_map[unique_reg] = bank.top(); + bank.pop(); + active.insert(interval_reg); } } - rt["reginfo"] = regs; - } + return register_map; + }; + // todo: vec_/gpr_pool are hardware-specific and should be provided by a backend, e.g. 
overloaded generator + std::set vec_pool; + for (Reg i = 0; i < reg_count; i++) + vec_pool.insert(i); + std::set gpr_pool(vec_pool); + for (const auto& t_reg : manually_assigned_vecs) + vec_pool.erase(t_reg.second); + for (const auto& t_reg : manually_assigned_gprs) + gpr_pool.erase(t_reg.second); + auto unique2reused_map_vec = linescan_assign_registers(live_intervals_vec, vec_pool); + auto unique2reused_map_gpr = linescan_assign_registers(live_intervals_gpr, gpr_pool); + std::map assigned_regs(std::move(manually_assigned_gprs)); + assigned_regs.insert(manually_assigned_vecs.begin(), manually_assigned_vecs.end()); + auto register_assigned_regs = [IS_MANUALLY_ALLOCATED_REG, &assigned_regs](const std::map& unique_regs, + const std::map& unique2reused) { + for (const auto& reg : unique_regs) { + if (reg.second == IS_MANUALLY_ALLOCATED_REG) + continue; + if (unique2reused.count(reg.second) == 0) + throw ngraph::ngraph_error("Assign registers failed to allocate register for a tensor"); + assigned_regs[reg.first] = unique2reused.at(reg.second); + } + }; + register_assigned_regs(regs_vec, unique2reused_map_vec); + register_assigned_regs(regs_gpr, unique2reused_map_gpr); + + for (const auto& t_op : typed_ops) { + for (const auto& out : t_op.second->outputs()) { + const auto& t = out.get_tensor_ptr(); + auto& rt = t->get_rt_info(); + rt["reginfo"] = static_cast(assigned_regs[t]); + } + } return false; } + diff --git a/src/common/snippets/src/pass/broadcast_to_movebroadcast.cpp b/src/common/snippets/src/pass/broadcast_to_movebroadcast.cpp new file mode 100644 index 00000000000..d6e16633ba8 --- /dev/null +++ b/src/common/snippets/src/pass/broadcast_to_movebroadcast.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/pass/broadcast_to_movebroadcast.hpp" +#include "snippets/pass/insert_movebroadcast.hpp" +#include + +#include +#include + +using namespace ngraph; + 
+ngraph::snippets::pass::BroadcastToMoveBroadcast::BroadcastToMoveBroadcast() { + MATCHER_SCOPE(BroadcastToMoveBroadcast); + + auto m_broadcast = ngraph::pattern::wrap_type(); + + auto callback = [this](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::BroadcastToMoveBroadcast") + auto root = m.get_match_root(); + if (auto broadcast_v1 = ov::as_type_ptr(root)) { + if (broadcast_v1->get_broadcast_spec().m_type != ov::op::AutoBroadcastType::NUMPY) + return false; + } else if (auto broadcast_v3 = ov::as_type_ptr(root)) { + if (broadcast_v3->get_broadcast_spec().m_type != ov::op::BroadcastType::NUMPY) + return false; + } + + const auto target_shape = root->get_output_partial_shape(0); + const auto value_shape = root->get_input_partial_shape(0); + if (target_shape.is_dynamic() || value_shape.is_dynamic()) { + return false; + } + + const auto broadcast_node = ngraph::snippets::pass::InsertMoveBroadcast::BroadcastNodeLastDim(root->input_value(0), + target_shape.get_shape(), + value_shape.get_shape()); + replace_output_update_name(root->output(0), broadcast_node); + ngraph::copy_runtime_info(root, broadcast_node.get_node_shared_ptr()); + + return true; + }; + + auto m = std::make_shared(m_broadcast, matcher_name); + register_matcher(m, callback); +} diff --git a/src/common/snippets/src/pass/collapse_subgraph.cpp b/src/common/snippets/src/pass/collapse_subgraph.cpp index 0e7f7e1a402..b348ccb85e9 100644 --- a/src/common/snippets/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/src/pass/collapse_subgraph.cpp @@ -6,6 +6,9 @@ #include #include "snippets/pass/collapse_subgraph.hpp" +#include "snippets/pass/tokenization.hpp" +#include "snippets/pass/transpose_decomposition.hpp" +#include "snippets/pass/fuse_transpose_brgemm.hpp" #include "snippets/op/subgraph.hpp" #include "snippets/utils.hpp" @@ -14,11 +17,11 @@ #include #include #include "transformations/utils/utils.hpp" +#include 
"ngraph/op/util/attr_types.hpp" #include #include #include -#include #include #include #include @@ -32,33 +35,38 @@ namespace pass { namespace { auto outputs_are_not_broadcastable(const std::shared_ptr& node) -> bool { - auto outputs = node->outputs(); - auto find_smallest_output_shape = [](const std::vector>& outputs) -> Shape { - return std::accumulate(std::begin(outputs), std::end(outputs), ngraph::Shape(outputs.begin()->get_shape()), - [](Shape& other_shape, const Output& output){ - return shape_size(output.get_shape()) < shape_size(other_shape) ? output.get_shape() : other_shape; - }); - }; - auto ref_shape = find_smallest_output_shape(outputs); - - auto check_shapes_broadcastable = [ref_shape](const Output& output) -> bool { - auto other_shape = output.get_shape(); - - if (other_shape.size() != ref_shape.size()) { - return false; - } - - return std::inner_product(std::begin(other_shape), std::end(other_shape), std::begin(ref_shape), true, - std::logical_and(), [](Shape::value_type lsh, Shape::value_type rsh){ - return rsh == 1 || lsh == rsh; - }); - }; - - return std::find_if_not(std::begin(outputs), std::end(outputs), check_shapes_broadcastable) != std::end(outputs); + const auto& outputs = node->outputs(); + if (outputs.size() <= 1) + return false; + ov::PartialShape ref_shape = outputs.front().get_partial_shape(); + bool success = true; + for (int i = 1; i < outputs.size() && success; i++) { + success &= ov::PartialShape::broadcast_merge_into(ref_shape, outputs[i].get_partial_shape(), ov::op::AutoBroadcastType::NUMPY); + } + return !success; } auto is_supported_op(const std::shared_ptr &n) -> bool { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::is_supported_op") + auto is_supported_matmul = [](const std::shared_ptr& n) -> bool { + const auto& matmul = is_type(n); + const auto& out_shape = n->get_output_partial_shape(0); + return matmul && out_shape.is_static() && out_shape.size() == 4; + }; + auto is_supported_transpose = 
[](const std::shared_ptr& n) -> bool { + const auto& transpose = as_type_ptr(n); + const auto& out_shape = n->get_output_partial_shape(0); + if (transpose && out_shape.is_static()) { + const auto& order = as_type_ptr(n->get_input_node_shared_ptr(1)); + if (order) { + const auto order_value = order->cast_vector(); + return TransposeDecomposition::supported_cases.count(order_value) != 0 || + FuseTransposeBrgemm::supported_cases.count(order_value) != 0; + } + } + return false; + }; + auto is_supported_fq_op = [](const std::shared_ptr& n) -> bool { // TODO [92179]: Add support of FakeQuantize with non-constants inputs and with binarization algorithm. const auto fq = ov::as_type_ptr(n); @@ -69,6 +77,10 @@ auto is_supported_op(const std::shared_ptr &n) -> bool { is_type(n->get_input_node_shared_ptr(4)); }; + auto is_supported_ternary_eltwise_op = [](const std::shared_ptr &n) -> bool { + return ov::is_type(n); + }; + auto is_supported_binary_eltwise_op = [](const std::shared_ptr &n) -> bool { return ov::is_type(n) || ov::is_type(n) @@ -114,14 +126,51 @@ auto is_supported_op(const std::shared_ptr &n) -> bool { || ov::is_type(n) || ov::is_type(n); }; - return is_supported_fq_op(n) || is_supported_unary_eltwise_op(n) || is_supported_binary_eltwise_op(n); + + auto is_supported_softmax = [](const std::shared_ptr &n) -> bool { + if (n->get_input_size() != 1 || n->get_input_partial_shape(0).rank().is_dynamic()) + return false; + int64_t axis = -1; + const auto rank = n->get_input_partial_shape(0).rank(); + if (const auto softmax_v8 = ngraph::as_type_ptr(n)) { + axis = ngraph::normalize_axis(n->get_friendly_name(), softmax_v8->get_axis(), rank); + } else if (const auto softmax_v1 = ngraph::as_type_ptr(n)) { + axis = softmax_v1->get_axis(); + } else { + return false; + } + return axis >= 0 && axis == (rank.get_length() - 1); + }; + + auto is_supported_broadcast_op = [](const std::shared_ptr &n) -> bool { + // Broadcast is supported only for MHA tokenization where there are needed 
and special checks + if (auto broadcast_v1 = ov::as_type_ptr(n)) { + return broadcast_v1->get_broadcast_spec().m_type == ov::op::AutoBroadcastType::NUMPY; + } else if (auto broadcast_v3 = ov::as_type_ptr(n)) { + return broadcast_v3->get_broadcast_spec().m_type == ov::op::BroadcastType::NUMPY; + } + return false; + }; + + return is_supported_fq_op(n) || + is_supported_unary_eltwise_op(n) || + is_supported_binary_eltwise_op(n) || + is_supported_ternary_eltwise_op(n) || + is_supported_transpose(n) || + is_supported_softmax(n) || + is_supported_matmul(n) || + is_supported_broadcast_op(n); } auto has_supported_in_out(const std::shared_ptr &n) -> bool { - auto supported = [](descriptor::Tensor& t) -> bool { - static const std::set supported_data_types = - { ngraph::element::f32, ngraph::element::bf16, ngraph::element::i8, ngraph::element::u8 }; - return t.get_partial_shape().is_static() && supported_data_types.count(t.get_element_type()) != 0; + auto supported = [&n](descriptor::Tensor& t) -> bool { + // Todo: int32 isn't supported in general because i32 emitters are required for bit-exact i32 calculations in some cases + // So i32 is supported exclusively for transposes and broadcast + return t.get_partial_shape().is_static() && + (TokenizeSnippets::supported_element_types.count(t.get_element_type()) != 0 || + (t.get_element_type() == ngraph::element::i32 && + (ov::is_type(n) || + ov::is_type(n)))); }; const auto & inputs = n->inputs(); const auto & outputs = n->outputs(); @@ -155,65 +204,15 @@ auto get_num_result_children(const std::shared_ptr &node) -> size_t } return result; } -// Need to update tensor name manually, since intel_cpu::Graph::Replicate() looks at input.get_tensor().get_name(); -// If subgraph->get_output_size() == 1, then the name will be restored correctly from the node name -auto update_out_tensor_name(std::shared_ptr &subgraph) -> void { - bool not_set = true; - for (unsigned int i = 0; i < subgraph->get_output_size() && not_set; i++) { - for (const 
auto &in : subgraph->get_output_target_inputs(i)) { - if (ov::is_type(in.get_node())) { - const auto& body_result = subgraph->body_ptr()->get_output_op(i); - const auto& body_result_input = body_result->get_input_source_output(0); - op::Subgraph::fill_empty_output_names(subgraph->output(i), body_result_input); - not_set = false; - break; - } - } - } -} } // namespace -bool AppropriateForSubgraph(const std::shared_ptr &node) { +const std::set ngraph::snippets::pass::TokenizeSnippets::supported_element_types = + { ngraph::element::f32, ngraph::element::bf16, ngraph::element::i8, ngraph::element::u8 }; + +bool TokenizeSnippets::AppropriateForSubgraph(const std::shared_ptr &node) { return is_supported_op(node) && has_supported_in_out(node) && node->get_control_dependencies().empty(); } -void SetSnippetsNodeType(const std::shared_ptr &node, SnippetsNodeType nodeType) { - auto &rt = node->get_rt_info(); - rt["SnippetsNodeType"] = nodeType; -} - -SnippetsNodeType GetSnippetsNodeType(const std::shared_ptr &node) { - OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::GetSnippetsNodeType") - auto &rt = node->get_rt_info(); - const auto rinfo = rt.find("SnippetsNodeType"); - if (rinfo == rt.end()) - return SnippetsNodeType::NotSet; - return rinfo->second.as(); -} - -void SetTopologicalOrder(const std::shared_ptr &node, int64_t order) { - OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::SetTopologicalOrder") - auto &rt = node->get_rt_info(); - rt["TopologicalOrder"] = order; -} - -int64_t GetTopologicalOrder(const std::shared_ptr &node) { - auto &rt = node->get_rt_info(); - const auto rinfo = rt.find("TopologicalOrder"); - if (rinfo == rt.end()) - throw ngraph_error("Topological order is required, but not set."); - return rinfo->second.as(); -} - -bool EnumerateNodes::run_on_model(const std::shared_ptr &m) { - OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::EnumerateNodes") - int64_t order = 0; 
- // Todo: We don't really have to set order for every node, just for subgraph parents and children would be enough - for (auto &node : m->get_ordered_ops()) { - SetTopologicalOrder(node, order++); - } - return true; -} TokenizeSnippets::TokenizeSnippets() { MATCHER_SCOPE(TokenizeSnippets); enum continuation_strategy { @@ -224,7 +223,12 @@ TokenizeSnippets::TokenizeSnippets() { continuation_strategy strategy = continuation_strategy::reset; auto label = std::make_shared(pattern::any_input(), [](const std::shared_ptr &n) { - return GetSnippetsNodeType(n) != SnippetsNodeType::SkippedByPlugin && AppropriateForSubgraph(n); + // todo: MatMul and Transpose ops are always skipped by the SnippetsMarkSkipped pass. + // This is a temporary solution. Either modify SnippetsMarkSkipped + // or align this with the custom MHA tokenization pass. + return (GetSnippetsNodeType(n) != SnippetsNodeType::SkippedByPlugin || + ov::is_type(n) || ov::is_type(n)) + && AppropriateForSubgraph(n); }); ngraph::graph_rewrite_callback callback = [&, strategy](ngraph::pattern::Matcher &m) -> bool { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::CreateSubgraph_callback") @@ -248,7 +252,7 @@ TokenizeSnippets::TokenizeSnippets() { auto subgraph = op::Subgraph::wrap_node_as_subgraph(node); subgraph->get_rt_info()["originalLayersNames"] = getFusedNames(node) + node->get_friendly_name(); ngraph::replace_node(node, subgraph); - update_out_tensor_name(subgraph); + op::update_out_tensor_name(subgraph); }; auto abort_with_strategy = [&](const std::string& message_reset, @@ -456,10 +460,15 @@ TokenizeSnippets::TokenizeSnippets() { // Result op has a single input internal_inputs.push_back(source_result->input_value(0)); } else { - // We have to save explicitly FQ Constants to call ConstantFolding after Tokenization. 
- // After ConstantFolding we will move remaining non-scalar Constants from body using ConvertConstantsToParameters pass - if ((utils::is_scalar_constant(input_node)) || - (ov::is_type(input_node) && ov::is_type(node))) { + // We need some non-scalar constants inside Subgraph in the following cases: + // [*] We have to save explicitly FQ Constants to call ConstantFolding after Tokenization. + // After ConstantFolding we will move remaining non-scalar Constants from body using ConvertConstantsToParameters pass + // [*] We support Transpose with second Constant input (represents order). This Constant will not be scheduled + // and will only be used to decompose Transpose into a proper Load, Store and Loop combination. + if (ov::is_type(input_node) && + (ngraph::shape_size(input_value.get_shape()) == 1 || + ov::is_type(node) || + op::Subgraph::constant_input_should_be_inside_body(node))) { internal_inputs.push_back(input_node->output(0)); } else { external_inputs.push_back(input_value); @@ -489,18 +498,24 @@ TokenizeSnippets::TokenizeSnippets() { // than the actual number of Constants during tokenization. // To avoid unsupported number of non-scalar Constants in the future (plugin specific limitation) // we should calculate potentional number of non-scalar Constants that will be moved up from body. 
- size_t hidden_non_scalar_constant_count = 0; + size_t hidden_data_count = 0; + bool need_buffer = false; if (const auto fq_node = ov::as_type_ptr(node)) { - hidden_non_scalar_constant_count += ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(fq_node); + hidden_data_count += ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(fq_node); + // Ops require a Buffer + } else if (ov::is_type(node) || + ov::is_type(node)) { + need_buffer |= true; } ResultVector body_results; std::vector>> subgraph_result_inputs; for (auto subgraph : input_subgraphs) { - // we should summurize non-scalar Constants count from all input subgraphs - // because we will collapse them with our node and we should get total count of non-scalar Constants - hidden_non_scalar_constant_count += ov::as_type_ptr(subgraph)->get_non_scalar_constants_count(); + // we should summurize additional needed data count (non-scalar Constants and Buffers) from all input subgraphs + // because we will collapse them with our node and we should get total count + hidden_data_count += ov::as_type_ptr(subgraph)->get_virtual_port_count(); + need_buffer |= ov::as_type_ptr(subgraph)->is_buffer_needed(); for (auto output : subgraph->outputs()) { bool first_side_consumer = true; @@ -541,13 +556,13 @@ TokenizeSnippets::TokenizeSnippets() { } // todo: move this plugin-specific constraint to the plugin callback - if (body_parameters.size() + body_results.size() + hidden_non_scalar_constant_count > 12) { + if (body_parameters.size() + body_results.size() + hidden_data_count + static_cast(need_buffer) > 12) { const std::string message_reset = "new subgraph is created. 
Impossible to schedule subgraph with " + std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " + - std::to_string(hidden_non_scalar_constant_count) + " non-scalar constants."; + std::to_string(hidden_data_count) + " non-scalar constants and " + std::to_string(need_buffer) + " buffers."; const std::string message_abort = "failed to continue subgraph. Impossible to schedule subgraph with " + std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " + - std::to_string(hidden_non_scalar_constant_count) + " non-scalar constants."; + std::to_string(hidden_data_count) + " non-scalar constants and " + std::to_string(need_buffer) + " buffers."; return abort_with_strategy(message_reset, message_abort); } @@ -557,7 +572,7 @@ TokenizeSnippets::TokenizeSnippets() { } auto subgraph = op::build_subgraph(node, external_inputs, body, subgraph_name); copy_runtime_info(replaced_nodes, subgraph); - const auto & act_body = subgraph->body(); + const auto& act_body = subgraph->body(); for (size_t i = 0; i < act_body.get_parameters().size(); i++) { act_body.get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name()); } @@ -574,16 +589,17 @@ TokenizeSnippets::TokenizeSnippets() { target_input.replace_source_output(subgraph->output(i)); } } - update_out_tensor_name(subgraph); + op::update_out_tensor_name(subgraph); subgraph->validate_and_infer_types(); - const auto & act_body1 = subgraph->body(); + const auto& act_body1 = subgraph->body(); for (size_t i = 0; i < act_body1.get_parameters().size(); i++) { act_body1.get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name()); } subgraph->get_rt_info()["originalLayersNames"] = fusedNames; - subgraph->set_non_scalar_constants_count(hidden_non_scalar_constant_count); + subgraph->set_virtual_port_count(hidden_data_count); + subgraph->set_buffer_needed(need_buffer); remark(1) << "Replacement (merge) done 
for: " << subgraph->get_friendly_name() diff --git a/src/common/snippets/src/pass/common_optimizations.cpp b/src/common/snippets/src/pass/common_optimizations.cpp index b94f32af075..787fb8f650d 100644 --- a/src/common/snippets/src/pass/common_optimizations.cpp +++ b/src/common/snippets/src/pass/common_optimizations.cpp @@ -11,7 +11,10 @@ #include "transformations/utils/utils.hpp" #include "snippets/pass/fq_decomposition.hpp" +#include "snippets/pass/softmax_reshape_elimination.hpp" +#include "snippets/pass/explicit_transpose_matmul_inputs.hpp" #include "snippets/op/subgraph.hpp" +#include "snippets/utils.hpp" #include "snippets/itt.hpp" NGRAPH_RTTI_DEFINITION(ngraph::snippets::pass::CommonOptimizations, "Snippets::CommonOptimizations", 0); @@ -31,7 +34,11 @@ void ConvertConstantsToParameters(const std::shared_ptrget_ops()) { auto constant = ov::as_type_ptr(op); - if (!(constant && ngraph::shape_size(constant->get_shape()) != 1ul)) + if (!constant || ngraph::shape_size(constant->get_shape()) == 1ul) + continue; + + const auto child = constant->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + if (op::Subgraph::constant_input_should_be_inside_body(child)) continue; auto parameter = std::make_shared(constant->get_element_type(), constant->output(0).get_partial_shape()); @@ -67,9 +74,11 @@ CommonOptimizations::CommonOptimizations() { // Then if Subgraph contains FakeQuantize we enable specific transformation for quantized subgraphs. 
ngraph::pass::Manager manager; manager.register_pass(); + manager.register_pass(); if (is_quantized) { manager.register_pass(); } + manager.register_pass(); manager.run_passes(body); // At the moment only non-scalar Constants of FakeQuantize can be inside Subgraph diff --git a/src/common/snippets/src/pass/convert_constants.cpp b/src/common/snippets/src/pass/convert_constants.cpp index 3cb791d0130..37cf0f85266 100644 --- a/src/common/snippets/src/pass/convert_constants.cpp +++ b/src/common/snippets/src/pass/convert_constants.cpp @@ -20,11 +20,16 @@ ngraph::snippets::pass::ConvertConstantsToScalars::ConvertConstantsToScalars() { ngraph::graph_rewrite_callback callback = [this](ngraph::pattern::Matcher &m) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ConvertConstantsToScalars") auto constant = as_type_ptr(m.get_match_root()); - auto scalar = std::make_shared(*constant); + if (ov::shape_size(constant->get_output_shape(0)) != 1) + return false; + // Note that all Constants {1,1,1,1} are converted to Scalar {1} here + // This is needed to simplify shape inference, otherwise {1,1,1,1} Constants can increase output rank + // Also some operations support only scalar shapes, so we need separate scalars and shape [1] + const auto shape = constant->get_output_shape(0).size() == 0 ? 
ov::Shape{} : ov::Shape{1}; + auto scalar = std::make_shared(ov::op::v0::Constant(*constant, shape)); scalar->set_friendly_name(constant->get_friendly_name()); ngraph::copy_runtime_info(constant, scalar); ngraph::replace_node(constant, scalar); - return true; }; register_matcher(std::make_shared(constants), callback); diff --git a/src/common/snippets/src/pass/explicit_transpose_matmul_inputs.cpp b/src/common/snippets/src/pass/explicit_transpose_matmul_inputs.cpp new file mode 100644 index 00000000000..07e0045d880 --- /dev/null +++ b/src/common/snippets/src/pass/explicit_transpose_matmul_inputs.cpp @@ -0,0 +1,83 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/pass/explicit_transpose_matmul_inputs.hpp" +#include "snippets/pass/transpose_decomposition.hpp" +#include "snippets/op/subgraph.hpp" + +#include +#include +#include + + + +ngraph::snippets::pass::ExplicitTransposeMatMulInputs::ExplicitTransposeMatMulInputs() { + MATCHER_SCOPE(ExplicitTransposeMatMulInputs); + + auto m_matmul0 = std::make_shared( + ngraph::pattern::any_input(ngraph::pattern::has_static_shape()), + ngraph::pattern::any_input(ngraph::pattern::has_static_shape())); + + register_matcher(std::make_shared(m_matmul0, matcher_name), + [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ExplicitTransposeMatMulInputs") + auto root = m.get_match_root(); + bool rewritten = false; + + auto matmul0 = ngraph::as_type_ptr(root); + if (!matmul0) + return false; + + for (size_t i = 0; i < matmul0->get_input_size(); i++) { + if (i == 0 && !matmul0->get_transpose_a()) + continue; + if (i == 1 && !matmul0->get_transpose_b()) + continue; + + auto parent1 = matmul0->get_input_node_shared_ptr(i); + auto transpose1 = ngraph::as_type_ptr(parent1); + while (!transpose1 && !ov::is_type(parent1)) { + // We can set supported order and transposed_b(false) only if ops have scalar 
shapes to avoid shape mismatching + const auto parent_count = parent1->inputs().size(); + bool are_weights_scalar = true; + for (size_t j = 1; j < parent_count; ++j) { + are_weights_scalar = are_weights_scalar && ngraph::shape_size(parent1->get_input_shape(j)) == 1; + } + if (!are_weights_scalar) + break; + + parent1 = parent1->get_input_node_shared_ptr(0); + transpose1 = ngraph::as_type_ptr(parent1); + } + if (!transpose1) + continue; + + const auto transpose_pattern = ngraph::as_type_ptr(transpose1->get_input_node_shared_ptr(1)); + if (!transpose_pattern) + continue; + + auto transposed_order = transpose_pattern->cast_vector(); + std::swap(*transposed_order.rbegin(), *(transposed_order.rbegin() + 1)); + if (pass::TransposeDecomposition::supported_cases.count(transposed_order) == 0) + continue; + + auto new_transpose_order = std::make_shared(transpose_pattern->get_element_type(), + ngraph::Shape{4}, + transposed_order); + new_transpose_order->set_friendly_name(transpose_pattern->get_friendly_name()); + ngraph::copy_runtime_info(transpose_pattern, new_transpose_order); + transpose1->set_argument(1, new_transpose_order); + if (i == 0) { + matmul0->set_transpose_a(false); + } else { + matmul0->set_transpose_b(false); + } + rewritten |= true; + } + + return rewritten; + }); +} diff --git a/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp b/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp new file mode 100644 index 00000000000..73347c6475b --- /dev/null +++ b/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp @@ -0,0 +1,86 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/itt.hpp" + +#include "snippets/pass/fuse_transpose_brgemm.hpp" +#include "snippets/snippets_isa.hpp" + +#include "snippets/utils.hpp" + +#include "ngraph/opsets/opset1.hpp" +#include "ngraph/rt_info.hpp" +#include "ngraph/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/op/or.hpp" + +namespace ngraph { 
+namespace snippets { +namespace pass { +const std::set> FuseTransposeBrgemm::supported_cases = {{0, 2, 1, 3}}; +FuseTransposeBrgemm::FuseTransposeBrgemm() { + MATCHER_SCOPE(FuseTransposeBrgemm); + auto transpose_is_supported = [](const Output& transpose_port) { + const auto transpose_node = transpose_port.get_node_shared_ptr(); + // it's safe to do so because of the patterns we used. alternatively we can do it through pattern_values_map + const auto& constant = as_type_ptr(transpose_node->get_input_node_shared_ptr(1)); + // if Transpose in and out layout is not empty => something was already fused on this port + if (!utils::get_node_output_layout(transpose_node).empty() || + !utils::get_node_output_layout(transpose_node->get_input_node_shared_ptr(0)).empty()) + return false; + const auto& transpose_order = constant->cast_vector(); + // todo: this limitation is due to the fact that offsets are calculated in Kernel, and the only way + // to calc them non-default way is to set Parameter rt_info field. 
This limitation can be removed if + // the rt_info is properly propagated to the corresponding parameter + if (!is_type(transpose_node->get_input_node_shared_ptr(0)) || + supported_cases.count(transpose_order) == 0) + return false; + return true; + }; + auto constant = pattern::wrap_type(); + auto transpose = pattern::wrap_type({pattern::any_input(), constant}, transpose_is_supported); + auto transpose_matcher = std::make_shared(transpose); + auto brgemm_any = pattern::wrap_type({pattern::any_input(), pattern::any_input()}); + + auto brgemm_in0 = pattern::wrap_type({transpose, pattern::any_input()}); + auto brgemm_in1 = pattern::wrap_type({pattern::any_input(), transpose}); + auto brgemm_out0 = pattern::wrap_type({brgemm_any, constant}); + auto brgemm_or_transpose = std::make_shared(OutputVector{brgemm_in0, brgemm_in1, brgemm_out0}); + + auto callback = [=](pattern::Matcher& m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "FuseTransposeBrgemm") + auto set_layout_from_order = [](const std::shared_ptr& node, const ov::Output& port) { + const auto& const_order = as_type_ptr(node->get_input_node_shared_ptr(1)); + std::vector layout = const_order->cast_vector(); + auto& rt_info = port.get_node_shared_ptr()->get_rt_info(); + rt_info["Layout"] = layout; + }; + auto brgemm = as_type_ptr(m.get_match_root()); + // Transpose on the Brgemm's output + if (!brgemm) { + brgemm = as_type_ptr(m.get_match_root()->get_input_node_shared_ptr(0)); + const auto& brgemm_out = brgemm->output(0); + const auto& transpose_out = m.get_match_value(); + for (const auto& in : transpose_out.get_target_inputs()) + in.replace_source_output(brgemm->output(0)); + set_layout_from_order(as_type_ptr(transpose_out.get_node_shared_ptr()), brgemm_out); + } + for (int i = 0; i < brgemm->get_input_size(); i++) { + const auto& in_value = brgemm->input_value(i); + if (transpose_matcher->match(in_value)) { + const auto& transpose = as_type_ptr(in_value.get_node_shared_ptr()); + 
set_layout_from_order(transpose, transpose->input_value(0)); + brgemm->set_argument(i, transpose->input_value(0)); + } + } + // need to run validate_and_infer_types manually: either input shapes were updated or + // output Layout was updated (out shape will be updated in validate_and_infer_types()) + brgemm->validate_and_infer_types(); + return true; + }; + register_matcher(std::make_shared(brgemm_or_transpose, matcher_name), callback); +} + +} // namespace pass +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/pass/insert_buffer.cpp b/src/common/snippets/src/pass/insert_buffer.cpp new file mode 100644 index 00000000000..e3fdb0173ef --- /dev/null +++ b/src/common/snippets/src/pass/insert_buffer.cpp @@ -0,0 +1,96 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/remarks.hpp" + +#include "snippets/pass/insert_buffer.hpp" +#include "snippets/snippets_isa.hpp" + +#include +#include + +ngraph::snippets::pass::InsertBuffer::InsertBuffer(const int32_t allocation_rank) { + MATCHER_SCOPE(InsertBuffer); + // The list of operations that require Buffers on their Inputs and Outputs + const auto pattern = ngraph::pattern::wrap_type(); + + register_matcher(std::make_shared(pattern, matcher_name), + [this, allocation_rank](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertBuffer") + auto root = m.get_match_root(); + bool rewritten = false; + + // check if already has Buffer, Parameter or Constant as an input + for (const auto& input : root->inputs()) { + const auto input_node = input.get_source_output().get_node()->shared_from_this(); + if (!ov::is_type(input_node) && + !ov::is_type(input_node) && + !ov::is_type(input_node)) { + const auto buffer = std::make_shared(input_node, allocation_rank); + root->set_argument(input.get_index(), buffer); + rewritten |= true; + } + if 
(ov::is_type(input.get_source_output().get_node_shared_ptr()) && + input.get_source_output().get_target_inputs().size() != 1) { + throw ngraph::ngraph_error( + "If Buffer is an input for operation output, this Buffer should be a single consumer for this port"); + } + } + + // check if already has Buffer or outputs is Result + for (const auto& output : root->outputs()) { + const auto target_inputs = output.get_target_inputs(); + if (target_inputs.size() > 1) { + for (const auto& consumer : target_inputs) { + const auto output_node = consumer.get_node()->shared_from_this(); + if (ov::is_type(output_node)) { + // If some of children from one common port are different Buffers, + // we should remove them to insert one common Buffer on one common port + replace_output_update_name(output_node->output(0), output_node->input_value(0)); + } else if (ov::is_type(output_node)) { + // TODO: At this moment operation which should be wrapped by Buffers doesn't support several children where one of them is Result + // because Result and Buffer from one root port should have the same register. 
It's not supported at the moment + // For example, + // Buffer + // | + // Softmax + // / \ + // Buffer Result + throw ngraph::ngraph_error( + "Operation which is should be wrapped by Buffers has few children from one output port where one of them is Result"); + } + } + } + + const auto buffer = std::make_shared(output, allocation_rank); + for (const auto& consumer : output.get_target_inputs()) { + const auto output_node = consumer.get_node()->shared_from_this(); + if (output_node != buffer && + !ov::is_type(output_node) && + !ov::is_type(output_node)) { + consumer.replace_source_output(buffer); + rewritten |= true; + } + } + + const auto new_target_inputs = output.get_target_inputs(); + const auto has_buffer_on_output = std::any_of(new_target_inputs.begin(), new_target_inputs.end(), [](const ov::Input& consumer) { + const auto child = consumer.get_node()->shared_from_this(); + // We check for count of target inputs of Buffer output because + // we created Buffer op with root input previously for the next possible insertions + // Thus, if Buffer wasn't inserted, this op doesn't have target inputs on output + return ov::is_type(child) && child->output(0).get_target_inputs().size() > 0; + }); + if (has_buffer_on_output && new_target_inputs.size() != 1) { + throw ngraph::ngraph_error( + "If Buffer is a input for operation output, this Buffer should be a single consumer for this port"); + } + } + return rewritten; + }); +} diff --git a/src/common/snippets/src/pass/insert_load_store.cpp b/src/common/snippets/src/pass/insert_load_store.cpp index 8d60c4b7cff..707dd71375e 100644 --- a/src/common/snippets/src/pass/insert_load_store.cpp +++ b/src/common/snippets/src/pass/insert_load_store.cpp @@ -15,15 +15,23 @@ ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) { MATCHER_SCOPE(InsertLoad); register_matcher(std::make_shared( - ngraph::pattern::wrap_type(), matcher_name), + ngraph::pattern::wrap_type(), matcher_name), [this, count](ngraph::pattern::Matcher &m) 
{ OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertLoad") auto root = m.get_match_root(); // check if already has Load as an output - for (auto output : root->outputs()) { - for (auto consumer : output.get_target_inputs()) { - if (ov::is_type(consumer.get_node())) { + for (const auto& output : root->outputs()) { + for (const auto& consumer : output.get_target_inputs()) { + // if a parameter is connected to a Load => we don't need another one + // if a parameter is connected to LoopBegin => there must be Load inside the Loop + // if a parameter is connected to MatMul => we don't need Load (read/write is encapsulated into the brgemm emitter) + // (it's the responsibility of transformation that inserted the Loops) + const auto& consumer_node = consumer.get_node(); + if (ov::is_type(consumer_node) || + ov::is_type(consumer_node) || + ov::is_type(consumer_node) || + ov::is_type(consumer_node)) { return false; } } @@ -33,8 +41,8 @@ ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) { ngraph::copy_runtime_info(root, load); bool rewritten = false; - for (auto output : root->outputs()) { - for (auto consumer : output.get_target_inputs()) { + for (const auto& output : root->outputs()) { + for (const auto& consumer : output.get_target_inputs()) { if (consumer.get_node()->shared_from_this() != load) { consumer.replace_source_output(load); rewritten |= true; @@ -49,19 +57,23 @@ ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) { ngraph::snippets::pass::InsertStore::InsertStore(const size_t count) { MATCHER_SCOPE(InsertStore); register_matcher(std::make_shared( - ngraph::pattern::wrap_type(), matcher_name), + ngraph::pattern::wrap_type(), matcher_name), [this, count](ngraph::pattern::Matcher &m) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertStore") auto root = m.get_match_root(); // check if already has Store as an input - for (auto input : root->inputs()) { - if 
(ov::is_type(input.get_source_output().get_node())) { + for (const auto& input : root->inputs()) { + const auto& parent_node = input.get_source_output().get_node(); + if (ov::is_type(parent_node) || + ov::is_type(parent_node) || + ov::is_type(parent_node) || + ov::is_type(parent_node)) { return false; } } - auto store = std::make_shared (root->input_value(0), count); + auto store = std::make_shared(root->input_value(0), count); ngraph::copy_runtime_info(root, store); root->set_argument(0, store); return true; diff --git a/src/common/snippets/src/pass/insert_loops.cpp b/src/common/snippets/src/pass/insert_loops.cpp new file mode 100644 index 00000000000..f6d83bf6da7 --- /dev/null +++ b/src/common/snippets/src/pass/insert_loops.cpp @@ -0,0 +1,285 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/pass/insert_loops.hpp" +#include "snippets/pass/loop_helpers.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" + +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +InsertLoops::InsertLoops(ov::PartialShape master_shape, size_t loop_depth, size_t vector_size, bool single_loop_body) + : m_master_shape(std::move(master_shape)), m_loop_depth(loop_depth), m_vector_size(vector_size), m_single_loop_body(single_loop_body) { + if (m_master_shape.size() < m_loop_depth) + throw ngraph_error("InsertLoops can't insert loops: master shape rank is too small"); +} + +std::vector InsertLoops::calculate_inner_apply_increments(const ov::PartialShape& master, + const std::vector& shapes) { + // Inner Loop applies increments if a dimension is not broadcasted + std::vector apply_increments; + apply_increments.reserve(shapes.size()); + std::transform(shapes.begin(), shapes.end(), std::back_inserter(apply_increments), + [=](const ov::PartialShape& ps) { return utils::get_inner_dim(ps) != 1 && utils::get_inner_dim(master) != 1; }); + return apply_increments; +} +std::vector 
InsertLoops::calculate_outer_apply_increments(const std::vector& shapes) { + // Outer Loop applies increments only if a corresponding lower dim was broadcasted (or all lower dims == 1) + std::vector apply_increments; + apply_increments.reserve(shapes.size()); + std::transform(shapes.begin(), shapes.end(), std::back_inserter(apply_increments), + [=](const ov::PartialShape& ps) { return utils::get_outer_dim(ps) != 1 && utils::get_inner_dim(ps) == 1; }); + return apply_increments; +} +std::vector InsertLoops::calculate_finalization_offsets(const ov::PartialShape& master, + const std::vector& shapes) { + const auto inner_work_amount = utils::get_inner_dim(master).get_length(); + std::vector inner_finalization_offsets(shapes.size(), 0); + std::transform(shapes.begin(), shapes.end(), inner_finalization_offsets.begin(), + [=](const ov::PartialShape& ps) { + return utils::get_outer_dim(ps) == 1 && utils::get_inner_dim(ps) != 1 ? -inner_work_amount : 0; + }); + return inner_finalization_offsets; +} + +void insert_loops_explicitly(const ov::NodeVector& ops, const size_t vector_size) { + ov::NodeVector body; + ov::NodeVector body_remainder; + ov::OutputVector body_parameters; + std::vector> body_results; + + // check for potential parameters for new Loop + auto add_body_parameters = [](const std::shared_ptr& op, ov::OutputVector& body_parameters) { + for (const auto& input : op->inputs()) { + auto parent = input.get_source_output().get_node_shared_ptr(); + if (ov::is_type(parent) || + ov::is_type(parent) || + ov::is_type(parent) || + ov::is_type(parent)) { + body_parameters.push_back(input.get_source_output()); + } + } + }; + + // check for potential results for new Loop + auto add_body_results = [](const std::shared_ptr& op, std::vector>& body_results) { + for (const auto& output : op->outputs()) { + for (const auto& target_input : output.get_target_inputs()) { + auto child = target_input.get_node(); + if (ov::is_type(child) || + ov::is_type(child) || + ov::is_type(child) || 
+ ov::is_type(child)) { + body_results.push_back(target_input); + } + } + } + }; + + // check for potential missing body ops for new loop + std::function& op, ov::NodeVector& body)> add_missing_body_ops; + add_missing_body_ops = [&](const std::shared_ptr& op, ov::NodeVector& body) { + if (body_remainder.size()) { + for (const auto& input : op->inputs()) { + auto parent = input.get_source_output().get_node_shared_ptr(); + auto iter = std::find(body_remainder.begin(), body_remainder.end(), parent); + if (iter != body_remainder.end()) { + *std::back_inserter(body) = std::move(*iter); + add_missing_body_ops(parent, body); + add_body_parameters(parent, body_parameters); + add_body_results(op, body_results); + } + } + } + }; + + auto wrap_body_by_loop = [&](const ov::NodeVector& body, const ov::OutputVector& body_parameters, const std::vector>& body_results) { + NGRAPH_CHECK(!body_parameters.empty(), "The count of parameters for loop should be more than zero to create loop"); + NGRAPH_CHECK(!body_results.empty(), "The count of results for loop should be more than zero to create loop"); + std::vector body_shapes; + const auto count_io = body_parameters.size() + body_results.size(); + body_shapes.reserve(count_io); + std::transform(body_parameters.begin(), body_parameters.end(), std::back_inserter(body_shapes), + [](const ov::Output& out) { return out.get_partial_shape(); }); + std::transform(body_results.begin(), body_results.end(), std::back_inserter(body_shapes), + [](const ov::Input& in) { return in.get_partial_shape(); }); + + auto body_master_shape = body_shapes.front(); + for (const auto& shape : body_shapes) { + NGRAPH_CHECK(PartialShape::broadcast_merge_into(body_master_shape, shape, ::ngraph::op::AutoBroadcastType::NUMPY), + "Loop input and output must be numpy broadcastable"); + } + const auto inner_work_amount = utils::get_inner_dim(body_master_shape).get_length(); + const auto outer_work_amount = utils::get_outer_dim(body_master_shape).get_length(); + + auto 
apply_increments = InsertLoops::calculate_inner_apply_increments(body_master_shape, body_shapes); + std::vector inner_finalization_offsets(body_shapes.size(), 0); + if (outer_work_amount > 1) { + inner_finalization_offsets = InsertLoops::calculate_finalization_offsets(body_master_shape, body_shapes); + } + + const auto& inner_loop_begin = op::insertLoopBeginAfterOutputs(body_parameters); + const auto& inner_loop_end = op::insertLoopEndBeforeInputs( + body_results, inner_loop_begin, inner_work_amount, vector_size, + apply_increments, inner_finalization_offsets); + // set internal flag to enable scalar vs vector loop optimizations + inner_loop_end->has_outer_loop = outer_work_amount > 1; + // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in + // sorted ops (so it's between Parameters and LoopBegin). Consequently, ScalarEmitters would be called + // outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg + // assigned to scalar will get corrupted inside the loop body). To avoid such cases, we add control dependency + // on LoopBegin to guarantee that the constants are executed inside the Loop. 
+ for (const auto& n : body) { + if (auto c = std::dynamic_pointer_cast(n)) { + c->add_control_dependency(inner_loop_begin); + } + } + + if (outer_work_amount > 1) { + std::vector apply_increments = InsertLoops::calculate_outer_apply_increments(body_shapes); + std::vector outer_finalization_offsets(body_shapes.size(), 0); + const auto& outer_loop_begin = op::insertLoopBegin(body_parameters); + op::insertLoopEnd(body_results, outer_loop_begin, outer_work_amount, 1lu, + apply_increments, outer_finalization_offsets); + } + }; + + auto op_is_outside_loop = [](const std::shared_ptr& op) -> bool { + if (ov::is_type(op) || + ov::is_type(op) || + ov::is_type(op)) + return true; + auto& rt = op->get_rt_info(); + auto outside_rt = rt.find("outside_loop"); + bool is_outside = false; + // If rt info isn't set it means that op should be inside loop by default + if (outside_rt != rt.end()) { + is_outside = outside_rt->second.as(); + } + return is_outside; + }; + + for (auto iter = ops.begin(); iter < ops.end(); iter++) { + const auto op = *iter; + // Need to check whether the op should be inside or outside the loop + if (op_is_outside_loop(op)) { + continue; + } + + // If we meet loopBegin or Brgemm, it means that all previous nodes from ordered body + // should be in one body. It's like a stop signal + const auto& loop_begin = ov::as_type_ptr(op); + const auto& brgemm = ov::as_type_ptr(op); + if (loop_begin || brgemm) { + if (!body.empty()) { + if (!body_results.empty()) { + wrap_body_by_loop(body, body_parameters, body_results); + } else { + // If there aren't body results, it means that the current body ops are inputs of some subsequent operations in ordered_ops + // So this set of the current body ops is part of the future body loop. 
+ // We should save them to add in body ops in the future + std::move(body.begin(), body.end(), std::back_inserter(body_remainder)); + } + } + + // we should skip the next existing Loop body + if (loop_begin) { + const auto &loop_end = loop_begin->get_loop_end(); + iter = std::find(iter, ops.end(), loop_end); + } + + // clear loop body to create the next + body.clear(); + body_parameters.clear(); + body_results.clear(); + } else { + add_missing_body_ops(op, body); + add_body_parameters(op, body_parameters); + add_body_results(op, body_results); + + body.push_back(op); + } + } + + if (!body.empty()) { + wrap_body_by_loop(body, body_parameters, body_results); + } +} + +bool InsertLoops::run_on_model(const std::shared_ptr &model) { + RUN_ON_FUNCTION_SCOPE(InsertLoops); + if (m_master_shape.is_dynamic()) + throw ngraph_error("InsertLoops doesn't support dynamic shapes yet"); + + const auto inner_work_amount = utils::get_inner_dim(m_master_shape).get_length(); + const auto outer_work_amount = m_loop_depth == 2 ? 
utils::get_outer_dim(m_master_shape).get_length() : 1; + + auto ops = model->get_ordered_ops(); + ParameterVector commonParams = model->get_parameters(); + // Note that topological sort parses node arguments in reversed order, but results are added - in direct order + // So we need to pass the reversed results to LoopEnd to keep the original traversal order in topological sorter + const auto& orig_results = model->get_results(); + ResultVector commonResults(orig_results.rbegin(), orig_results.rend()); + std::vector ioShapes; + + const auto& body_rt_info = model->get_rt_info(); + const auto& plugin_shapes = body_rt_info.find("PluginShapesOverride"); + if (plugin_shapes == body_rt_info.end()) { + throw ngraph_error("InsertLoops requires PluginShapesOverride rt_info field"); + } else { + const auto& new_shapes = plugin_shapes->second.as>>(); + if (new_shapes.size() != commonResults.size() + commonParams.size()) + throw ngraph_error("InsertLoops got invalid number of plugin-overridden shapes"); + for (int i = 0; i < commonParams.size(); i++) + ioShapes.emplace_back(new_shapes[i]); + // reverse overridden shapes for results since commonResults are reversed with respect to model->get_parameters() + for (int i = 0; i < commonResults.size(); i++) + ioShapes.emplace_back(new_shapes[new_shapes.size() - 1 - i]); + } + + if (inner_work_amount > 0) { + if (m_single_loop_body) { + const auto apply_increments = InsertLoops::calculate_inner_apply_increments(m_master_shape, ioShapes); + std::vector inner_finalization_offsets(ioShapes.size(), 0); + if (outer_work_amount > 1) { + inner_finalization_offsets = InsertLoops::calculate_finalization_offsets(m_master_shape, ioShapes); + } + const auto& inner_loop_begin = op::insertLoopBegin(commonParams); + const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_work_amount, + m_vector_size, apply_increments, inner_finalization_offsets); + // set internal flag to enable scalar vs vector loop optimizations + 
inner_loop_end->has_outer_loop = outer_work_amount > 1; + // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in + // sorted ops (so it's between Parameters and LoopBegin). Consequently, ScalarEmitters would be called + // outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg + // assigned to scalar will get corrupted inside the loop body). To avoid such cases, we add control dependency + // on LoopBegin to guarantee that the constants are executed inside the Loop. + for (const auto& n : model->get_ordered_ops()) { + if (auto c = std::dynamic_pointer_cast(n)) + c->add_control_dependency(inner_loop_begin); + else if (n == inner_loop_begin) + break; + } + + if (outer_work_amount > 1) { + std::vector apply_increments = InsertLoops::calculate_outer_apply_increments(ioShapes); + const auto& outer_loop_begin = op::insertLoopBegin(commonParams); + op::insertLoopEnd(commonResults, outer_loop_begin, outer_work_amount, 1lu, apply_increments); + } + } else { + insert_loops_explicitly(ops, m_vector_size); + } + } + + return true; +} + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/src/pass/insert_movebroadcast.cpp b/src/common/snippets/src/pass/insert_movebroadcast.cpp index 0e237ed3219..397345cc456 100644 --- a/src/common/snippets/src/pass/insert_movebroadcast.cpp +++ b/src/common/snippets/src/pass/insert_movebroadcast.cpp @@ -7,6 +7,8 @@ #include "snippets/pass/insert_movebroadcast.hpp" #include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" +#include #include #include @@ -17,43 +19,47 @@ using namespace ngraph; namespace { -std::shared_ptr broadcast_node_last_dim(const ngraph::Output& value, - const ov::Shape& target_shape, const ov::Shape& normalized_shape) { - std::shared_ptr broadcasted_node = value.get_node_shared_ptr(); - - if (target_shape == value.get_shape()) { - return broadcasted_node; - } - // Insert 
BroadcastMove only if the last dimension needs to be broadcasted. Higher-level dims broadcasting - // will be handled by pointer arithmetics in TileScheduler - if (*target_shape.rbegin() != *normalized_shape.rbegin()) { - ov::Shape broadcasted_shape = normalized_shape; - *broadcasted_shape.rbegin() = *target_shape.rbegin(); - broadcasted_node = std::make_shared(broadcasted_node, broadcasted_shape); - } - - return broadcasted_node; -} - - -std::pair> get_numpy_broadcast_shapes(const std::vector& input_shapes) { +std::pair> get_numpy_broadcast_partial_shapes(const std::vector& input_shapes) { ov::PartialShape target_shape = input_shapes.front(); for (auto i = 1; i < input_shapes.size(); i++) { if (!ov::PartialShape::broadcast_merge_into(target_shape, input_shapes[i], op::AutoBroadcastType::NUMPY)) throw ngraph::ngraph_error("InsertMoveBroadcast: Failed broadcast-merge input shapes"); } - std::vector normalized_shapes; + std::vector normalized_shapes; for (const auto& input : input_shapes) { - ov::Shape padded_shape{input}; + ov::PartialShape padded_shape{input}; padded_shape.insert(padded_shape.begin(), target_shape.size() - padded_shape.size(), 1); normalized_shapes.push_back(std::move(padded_shape)); } - return {target_shape.get_shape(), normalized_shapes}; + return {target_shape, normalized_shapes}; } } // namespace +ngraph::Output ngraph::snippets::pass::InsertMoveBroadcast::BroadcastNodeLastDim( + const ngraph::Output& value, const ov::PartialShape& target_shape, const ov::PartialShape& normalized_shape) { + if (target_shape == value.get_partial_shape()) { + return value; + } + + // Insert BroadcastMove only if the last dimension needs to be broadcasted. 
Higher-level dims broadcasting + // will be handled by pointer arithmetics inside outer LoopEmitter + if (*target_shape.rbegin() != *normalized_shape.rbegin()) { + ov::PartialShape broadcasted_shape = normalized_shape; + *broadcasted_shape.rbegin() = *target_shape.rbegin(); + const auto broadcast_node = std::make_shared(value, broadcasted_shape); + // BroadcastMove should be immediately executed after its input op (input op is node with output which should be broadcasted). + // For example, to execute Broadcast outside of a Loop We transfer control dependents and copy rt info + broadcast_node->add_node_control_dependents(value.get_node_shared_ptr()); + ov::copy_runtime_info(value.get_node_shared_ptr(), broadcast_node); + + return broadcast_node->output(0); + } + + return value; +} + ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() { MATCHER_SCOPE(InsertMoveBroadcast); ngraph::graph_rewrite_callback callback = [this](ngraph::pattern::Matcher &m) { @@ -64,31 +70,35 @@ ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() { return false; } - auto is_scalar_constant = [](const ov::Output& v){ - if (auto constant = ov::as_type_ptr(v.get_node_shared_ptr())) { - if (constant->get_shape().empty() || ngraph::shape_size(constant->get_shape()) == 1) { - return true; - } - } - return false; + auto is_ignored_node = [](const ov::Output& v){ + // We don't need to insert BroadcastMove after the following operations: + // - Scalar has emitter with explicit broadcasting + // - VectorBuffer has scalar output shape to avoid broadcast conflicts and manually shape insertion. 
+ return utils::is_scalar_constant(v.get_node_shared_ptr()) || + ov::is_type(v.get_node_shared_ptr()); }; - std::vector input_shapes; - std::vector ignore_as_scalar; + std::vector input_shapes; + std::vector is_ignored; for (const auto& val : values) { - input_shapes.emplace_back(val.get_shape()); - ignore_as_scalar.push_back(is_scalar_constant(val)); + input_shapes.emplace_back(val.get_partial_shape()); + is_ignored.push_back(is_ignored_node(val)); + // Do not insert MoveBroadcast if any of the last dims is dynamic, + // since we don't know if we really need it. In these cases, broadcasting will be performed + // by outer Loop based on runtime shapes. + if (!is_ignored.back() && !input_shapes.back().rbegin()->is_static()) + return false; } // find the output tensor's shape, then broadcast all inputs so that they are compatible with respect to the last dim - auto bcast_shapes = get_numpy_broadcast_shapes(input_shapes); + auto bcast_shapes = get_numpy_broadcast_partial_shapes(input_shapes); ngraph::OutputVector broadcasted_inputs; for (size_t i = 0; i < values.size(); ++i) { - if (ignore_as_scalar[i]) { + if (is_ignored[i]) { broadcasted_inputs.push_back(values[i]); } else { - auto node = broadcast_node_last_dim(values[i], bcast_shapes.first, bcast_shapes.second[i]); - ngraph::copy_runtime_info(root, node); + auto node = BroadcastNodeLastDim(values[i], bcast_shapes.first, bcast_shapes.second[i]); + ngraph::copy_runtime_info(root, node.get_node_shared_ptr()); broadcasted_inputs.push_back(node); } } diff --git a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp index 5a30f2c2d5a..b4fdb2506dc 100644 --- a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp +++ b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp @@ -34,10 +34,10 @@ ngraph::snippets::pass::LoadMoveBroadcastToBroadcastLoad::LoadMoveBroadcastToBro return false; } - auto inshape = 
root->input(0).get_shape(); - auto outshape = root->output(0).get_shape(); + auto inshape = root->input(0).get_partial_shape(); + auto outshape = root->output(0).get_partial_shape(); - auto broadcastload = std::make_shared(param, outshape); + auto broadcastload = std::make_shared(param, outshape, ov::as_type_ptr(input)->get_offset()); ngraph::copy_runtime_info(root, broadcastload); ngraph::replace_node(root, broadcastload); diff --git a/src/common/snippets/src/pass/loop_fusion.cpp b/src/common/snippets/src/pass/loop_fusion.cpp new file mode 100644 index 00000000000..587daa79121 --- /dev/null +++ b/src/common/snippets/src/pass/loop_fusion.cpp @@ -0,0 +1,331 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include "snippets/snippets_isa.hpp" +#include "snippets/pass/loop_fusion.hpp" +#include "snippets/utils.hpp" + +namespace { +using InputSet = std::set>; +using Edge = std::pair, InputSet>; + +auto can_be_merged(const std::shared_ptr& loop_end_up, + const std::shared_ptr& loop_begin_down) -> bool { + if (!loop_end_up || !loop_begin_down) + return false; + + const auto loop_end_down = loop_begin_down->get_loop_end(); + const auto loop_begin_up = loop_end_up->get_loop_begin(); + if (loop_end_down->get_work_amount() != loop_end_up->get_work_amount() || + loop_end_down->get_increment() != loop_end_up->get_increment()) + return false; + + // If between Loops there are common dependencies (for example, reducing operations), we cannot merge these Loops + // Example, when there is HorizonMax op between Loops: + // Data + // VectorBuffer LoopBegin + // \ Load | \ + // Maximum | / + // / LoopEnd + // HorizonMax | + // \ LoopBegin + // \ Load \ + // Subtract | + // Store / + // LoopEnd + auto up_dependent_ptrs = loop_end_up->get_control_dependents(); + ov::NodeVector up_dependents(up_dependent_ptrs.size(), nullptr); + std::transform(up_dependent_ptrs.begin(), up_dependent_ptrs.end(), 
up_dependents.begin(), [](ngraph::Node* node) { return node->shared_from_this(); }); + auto down_dependencies = loop_begin_down->get_control_dependencies(); + std::sort(up_dependents.begin(), up_dependents.end()); + std::sort(down_dependencies.begin(), down_dependencies.end()); + std::vector> common_nodes; + std::set_intersection(up_dependents.begin(), up_dependents.end(), down_dependencies.begin(), down_dependencies.end(), + std::back_inserter(common_nodes)); + // TODO: Add check for sequence/subgraph of depending nodes between Loops. + // At this moment we should have full list of dependencies and dependents of Loops to find intersection, + // not just first dependent of LoopEnd and first dependency of LoopBegin + return common_nodes.size() == 0; +} + +auto get_buffer_and_loop_end(const std::shared_ptr& loop_begin_down, + std::shared_ptr& loop_end_up, + std::shared_ptr& buffer) -> bool { + size_t fusion_input_num = 0; + for (const auto& parent : loop_begin_down->input_values()) { + const auto parent_shared = parent.get_node_shared_ptr(); + if (ov::is_type(parent_shared) || + ov::is_type(parent_shared) || + ov::is_type(parent_shared)) + continue; + + // We can fuse Loops even if LoopBegin has several of the same inputs (the common Buffer/LoopEnd) + if (buffer && buffer == parent_shared || !buffer && loop_end_up && loop_end_up == parent_shared) + continue; + + loop_end_up = ngraph::as_type_ptr(parent_shared); + buffer = ov::as_type_ptr(parent_shared); + if (buffer) { + if (buffer->output(0).get_target_inputs().size() == 0 || + buffer->get_input_size() != 1 || + buffer->get_input_source_output(0).get_target_inputs().size() != 1) + return false; + + loop_end_up = ngraph::as_type_ptr(buffer->get_input_node_shared_ptr(0)); + } + if (loop_end_up) + fusion_input_num++; + } + + return fusion_input_num == 1; +} + +auto collect_loop_inputs(const std::shared_ptr& loop_begin, + const std::shared_ptr& buffer, + std::vector& new_loop_inputs, + std::vector& new_ptr_increments, + 
std::vector& new_finalization_offsets) -> void { + const auto loop_end = loop_begin->get_loop_end(); + const auto ptr_increments = loop_end->get_ptr_increments(); + const auto finalization_offsets = loop_end->get_finalization_offsets(); + for (size_t i = 0; i < loop_begin->get_input_size(); i++) { + const auto input = loop_begin->input(i); + // Skip target Buffer + if (input.get_source_output().get_node_shared_ptr() != buffer) { + const auto edge = Edge{ input.get_source_output(), + loop_begin->output(input.get_index()).get_target_inputs() }; + new_loop_inputs.push_back(edge); + new_ptr_increments.push_back(ptr_increments[i]); + new_finalization_offsets.push_back(finalization_offsets[i]); + // Remove LoopBegin from Parent as target input + input.get_source_output().remove_target_input(input); + } + } +} + +auto collect_loop_outputs(const std::shared_ptr& loop_end, + const std::shared_ptr& buffer, + std::vector& new_loop_outputs, + std::vector& new_ptr_increments, + std::vector& new_finalization_offsets, + const bool reduce_max_case) -> bool { + const auto loop_begin = loop_end->get_loop_begin(); + const auto ptr_increments = loop_end->get_ptr_increments(); + const auto finalization_offsets = loop_end->get_finalization_offsets(); + bool is_current_reduce_max_case = false; + for (size_t i = 0; i < loop_end->get_output_size(); i++) { + // ReduceMax case. When Loop cannot have empty output as ngraph op, + // we should have fake edge through all Loops (LoopBegin->LoopEnd) which connect src and dst data. 
+ + // If we merge this Loop and the Loop before, we should remove this fake edge + // because now we have real data for storing + auto new_input_node = loop_end->get_input_node_shared_ptr(i); + if (ov::is_type(new_input_node)) { + // We set temporary boolean variable because this value is for the next LoopEnd (upper), not for the current LoopEnd + is_current_reduce_max_case = true; + // Remove LoopEnd from Parent as target input + loop_end->input_value(i).remove_target_input(loop_end->input(i)); + } else { + const auto output = loop_end->output(i); + // Skip target Buffer + InputSet target_inputs; + for (const auto& input : output.get_target_inputs()) { + if (input.get_node()->shared_from_this() != buffer || reduce_max_case) { + target_inputs.insert(input); + } + } + + if (target_inputs.size()) { + const auto edge = Edge{loop_end->input_value(output.get_index()), target_inputs}; + new_loop_outputs.push_back(edge); + new_ptr_increments.push_back(ptr_increments[loop_begin->get_input_size() + i]); + new_finalization_offsets.push_back(finalization_offsets[loop_begin->get_input_size() + i]); + // Remove LoopEnd from Parent as target input + loop_end->input_value(i).remove_target_input(loop_end->input(i)); + } + } + } + + return is_current_reduce_max_case; +} + +} // namespace + + +bool ngraph::snippets::pass::LoopFusion::Merge(const std::shared_ptr& loop_begin_down) { + if (!loop_begin_down) { + return false; + } + + std::shared_ptr loop_end_up = nullptr; + std::shared_ptr buffer = nullptr; + // Initialize the corresponding upper LoopEnd and Buffer + if (!get_buffer_and_loop_end(loop_begin_down, loop_end_up, buffer)) { + return false; + } + // Check for conditions of fusion + if (!can_be_merged(loop_end_up, loop_begin_down)) { + return false; + } + + const auto loop_end_down = loop_begin_down->get_loop_end(); + const auto loop_begin_up = loop_end_up->get_loop_begin(); + const auto new_input_count = loop_begin_up->get_input_size() + loop_begin_down->get_input_size(); + 
const auto new_output_count = loop_end_up->get_output_size() + loop_end_down->get_output_size(); + const auto new_io_count = new_input_count + new_output_count; + const auto ptr_increments_up = loop_end_up->get_ptr_increments(); + const auto ptr_increments_down = loop_end_down->get_ptr_increments(); + const auto finalization_offsets_up = loop_end_up->get_finalization_offsets(); + const auto finalization_offsets_down = loop_end_down->get_finalization_offsets(); + std::vector new_ptr_increments, new_finalization_offsets; + new_ptr_increments.reserve(new_io_count); + new_finalization_offsets.reserve(new_io_count); + + // Collect new loop inputs + std::vector loop_inputs; + loop_inputs.reserve(new_input_count); + new_ptr_increments.reserve(new_io_count); + new_finalization_offsets.reserve(new_io_count); + collect_loop_inputs(loop_begin_up, buffer, loop_inputs, new_ptr_increments, new_finalization_offsets); + collect_loop_inputs(loop_begin_down, buffer, loop_inputs, new_ptr_increments, new_finalization_offsets); + + // Collect new Loop outputs + std::vector loop_outputs; + loop_outputs.reserve(new_output_count); + // We can fuse Loop with maximum accumulator pattern only with Smth input + // So firstly, we analyze LoopEnd down (it's possible maximum accumulator pattern), set `reduce_max_case` variable + // if it's really maximum accumulator pattern, and then analyze LoopEnd up using `reduce_max_case` variable + const bool reduce_max_case = collect_loop_outputs(loop_end_down, buffer, loop_outputs, new_ptr_increments, new_finalization_offsets, false); + collect_loop_outputs(loop_end_up, buffer, loop_outputs, new_ptr_increments, new_finalization_offsets, reduce_max_case); + if (reduce_max_case) { + const auto target_inputs = loop_begin_down->output(0).get_target_inputs(); + NGRAPH_CHECK(target_inputs.size() == 1, "LoopBegin in ReduceMax should have only one consumer (Load) for out port 0"); + const auto load = 
ov::as_type_ptr(target_inputs.begin()->get_node()->shared_from_this()); + NGRAPH_CHECK(load != nullptr, "LoopBegin in ReduceMax should have only one consumer for out port 0 - Load"); + + const auto store = ov::as_type_ptr(loop_end_up->get_input_node_shared_ptr(0)); + NGRAPH_CHECK(store != nullptr, "Before LoopEnd should be Store emitter"); + + // Connect vector emitters before Store and after Load + load->output(0).replace(store->get_input_source_output(0)); + } + + const auto new_increment = loop_end_up->get_increment(); + const auto new_work_amount = loop_end_up->get_work_amount(); + + // Create new LoopBegin + OutputVector new_loop_begin_inputs; + new_loop_begin_inputs.reserve(loop_inputs.size()); + for (const auto& loop_input : loop_inputs) { + const auto data_output = loop_input.first; + new_loop_begin_inputs.push_back(data_output); + } + const auto new_loop_begin = std::make_shared(new_loop_begin_inputs); + NGRAPH_CHECK(new_loop_begin->get_input_size() == loop_inputs.size(), "New LoopBegin has incorrect count of inputs."); + + // Connect new LoopBegin to input edges + for (size_t i = 0; i < loop_inputs.size(); i++) { + const auto edge = loop_inputs[i]; + for (auto& target_input : edge.second) { + target_input.replace_source_output(new_loop_begin->output(i)); + } + } + + // Create new LoopEnd + OutputVector new_loop_end_inputs; + new_loop_end_inputs.reserve(loop_outputs.size() + 1); // + 1 - for loop_begin + for (const auto& loop_output : loop_outputs) { + const auto data_output = loop_output.first; + new_loop_end_inputs.push_back(data_output); + } + new_loop_end_inputs.push_back(new_loop_begin->output(new_loop_begin->get_input_size())); + const auto new_loop_end = std::make_shared(new_loop_end_inputs, new_work_amount, new_increment, + new_ptr_increments, new_finalization_offsets); + NGRAPH_CHECK(new_loop_end->get_output_size() == loop_outputs.size(), "New LoopEnd has incorrect count of outputs."); + // Connect new LoopEnd to output edges + for (size_t i = 0; 
i < loop_outputs.size(); i++) { + const auto edge = loop_outputs[i]; + auto new_output = new_loop_end->output(i); + for (auto& target_input : edge.second) { + target_input.replace_source_output(new_output); + } + } + + if (reduce_max_case) { + loop_end_down->output(0).replace(buffer->output(0)); + } else { + // Remove old Loops and Load/Store if there are around Buffer + for (size_t i = 0; i < loop_end_up->get_input_size() - 1; i++) { + auto new_output = loop_end_up->input_value(i); + loop_end_up->output(i).replace(new_output); + new_output.remove_target_input(loop_end_up->input(i)); + } + for (size_t i = 0; i < loop_begin_down->get_input_size(); i++) { + const auto output_target_inputs = loop_begin_down->output(i).get_target_inputs(); + const auto new_output = loop_begin_down->input_value(i); + for (const auto &target_input : output_target_inputs) { + target_input.replace_source_output(new_output); + } + + // Clear old Buffer children + new_output.remove_target_input(loop_begin_down->input(i)); + } + } + + new_loop_end->has_outer_loop = loop_end_down->has_outer_loop || loop_end_up->has_outer_loop; + + loop_begin_up->transfer_control_dependents(new_loop_begin); + loop_begin_down->transfer_control_dependents(new_loop_begin); + loop_end_up->transfer_control_dependents(new_loop_end); + loop_end_down->transfer_control_dependents(new_loop_end); + new_loop_begin->add_node_control_dependencies(loop_begin_up); + new_loop_begin->add_node_control_dependencies(loop_begin_down); + new_loop_end->add_node_control_dependencies(loop_end_up); + new_loop_end->add_node_control_dependencies(loop_end_down); + + // If there was Buffer between Loops, after Loop fusion + // we should remove the Buffer node and MemoryAccess nodes if it's needed + if (buffer) { + const auto buffer_input = buffer->get_input_node_shared_ptr(0); + const auto buffer_output = buffer->output(0).get_target_inputs().begin()->get_node()->shared_from_this(); + + // If after merging there are Load and Store, we should 
remove them + if (const auto store = ov::as_type_ptr(buffer_input)) { + store->output(0).replace(store->input_value(0)); + } + if (const auto load = ov::as_type_ptr(buffer_output)) { + load->output(0).replace(load->input_value(0)); + } + + // Remove Buffer if there are no Loops and MatMul after Loop fusion + // because only these operations can have Buffer node on inputs and outputs. + // So if there aren't, it means that Buffer is extra, and we can remove it + if (!ov::is_type(buffer_output) && !ov::is_type(buffer_input) && + !ov::is_type(buffer_output) && !ov::is_type(buffer_input)) { + buffer->output(0).replace(buffer->input_value(0)); + } + } + + return true; +} + +ngraph::snippets::pass::LoopFusion::LoopFusion() { + MATCHER_SCOPE(LoopFusion); + + auto m_loop_begin = ngraph::pattern::wrap_type(); + + auto callback = [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::LoopFusion") + auto& pattern_to_output = m.get_pattern_value_map(); + const auto loop_begin = ngraph::as_type_ptr(pattern_to_output.at(m_loop_begin).get_node_shared_ptr()); + const auto status = Merge(loop_begin); + return status; + }; + + auto matcher = std::make_shared(m_loop_begin, matcher_name); + register_matcher(matcher, callback); +} diff --git a/src/common/snippets/src/pass/loop_helpers.cpp b/src/common/snippets/src/pass/loop_helpers.cpp new file mode 100644 index 00000000000..696f7816a27 --- /dev/null +++ b/src/common/snippets/src/pass/loop_helpers.cpp @@ -0,0 +1,48 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ngraph/op/op.hpp" +#include "snippets/pass/loop_helpers.hpp" + +namespace ngraph { +namespace snippets { +namespace op { +std::shared_ptr insertLoopBeginAfterOutputs(const OutputVector& originalOutputs) { + std::vector>> originalChildInputs; + for (const auto& out : originalOutputs) { + originalChildInputs.push_back(out.get_target_inputs()); + } + + auto 
loop_begin = std::make_shared(originalOutputs); + + for (int i = 0; i < originalChildInputs.size(); i++) { + for (auto& input : originalChildInputs[i]) { + input.replace_source_output(loop_begin->output(i)); + } + } + return loop_begin; +} + +std::shared_ptr insertLoopEndBeforeInputs(const std::vector>& originalInputs, + const std::shared_ptr& loopBegin, + size_t work_amount, size_t increment, + std::vector apply_increment, + std::vector finalization_offsets) { + OutputVector originalParentOutputs; + for (const auto& in : originalInputs) { + originalParentOutputs.push_back(in.get_source_output()); + } + originalParentOutputs.push_back(loopBegin->output(loopBegin->get_output_size() - 1)); + auto loop_end = std::make_shared(originalParentOutputs, work_amount, increment, + std::move(apply_increment), std::move(finalization_offsets)); + + for (int i = 0; i < originalInputs.size(); i++) { + originalInputs[i].replace_source_output(loop_end->output(i)); + } + return loop_end; +} + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/pass/matmul_to_brgemm.cpp b/src/common/snippets/src/pass/matmul_to_brgemm.cpp new file mode 100644 index 00000000000..b74fb3e68cc --- /dev/null +++ b/src/common/snippets/src/pass/matmul_to_brgemm.cpp @@ -0,0 +1,45 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/itt.hpp" + +#include "snippets/pass/matmul_to_brgemm.hpp" + +#include "snippets/op/brgemm.hpp" + +#include "ngraph/opsets/opset1.hpp" +#include "ngraph/rt_info.hpp" +#include "ngraph/pattern/op/wrap_type.hpp" + +namespace ngraph { +namespace snippets { +namespace pass { + +MatMulToBrgemm::MatMulToBrgemm() { + MATCHER_SCOPE(MatMulToBrgemm); + auto matmul_pattern = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), + ngraph::pattern::any_input()}); + + auto callback = [=](ngraph::pattern::Matcher& m) { + 
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::MatMulToBrgemm") + auto& pm = m.get_pattern_value_map(); + const auto matmul = as_type_ptr(pm.at(matmul_pattern).get_node_shared_ptr()); + // Brgemm doesn't support transposed inputs currently, so we don't convert such matmuls + if (matmul->get_transpose_a() || matmul->get_transpose_b()) + return false; + + auto brgemm = std::make_shared(matmul->get_input_source_output(0), matmul->get_input_source_output(1)); + brgemm->set_friendly_name(matmul->get_friendly_name()); + ngraph::copy_runtime_info(matmul, brgemm); + ngraph::replace_node(matmul, brgemm); + return true; + }; + + auto m = std::make_shared(matmul_pattern, matcher_name); + register_matcher(m, callback); +} + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/src/pass/mha_tokenization.cpp b/src/common/snippets/src/pass/mha_tokenization.cpp new file mode 100644 index 00000000000..69a166140b4 --- /dev/null +++ b/src/common/snippets/src/pass/mha_tokenization.cpp @@ -0,0 +1,394 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/pass/mha_tokenization.hpp" +#include "snippets/pass/tokenization.hpp" +#include "snippets/op/subgraph.hpp" + +#include +#include +#include +#include + + +namespace { +auto is_supported_tensor(const ngraph::descriptor::Tensor& t) -> bool { + // TODO: Add support of all supported by common tokenization element types + // return ngraph::snippets::pass::TokenizeSnippets::supported_element_types.count(input.get_element_type()) != 0; + // Also only 4D is supported at the moment + return t.get_element_type() == ngraph::element::f32 && t.get_partial_shape().is_static() && t.get_shape().size() == 4; +} + +// TODO: Add support of FQ, Reshape? 
+auto is_supported_op(const std::shared_ptr& node) -> bool { + return ngraph::snippets::pass::TokenizeSnippets::AppropriateForSubgraph(node) && + (ngraph::is_type(node) || + ngraph::is_type(node) || + ngraph::is_type(node)); +} + +auto is_valid_transpose(const std::shared_ptr& node, std::vector expected_order) -> bool { + auto valid_transpose_order = [expected_order](const std::shared_ptr& node) -> bool { + const auto transpose_pattern = ngraph::as_type_ptr(node); + if (!transpose_pattern) + return false; + return transpose_pattern->cast_vector() == expected_order; + }; + + return node && node->get_output_target_inputs(0).size() == 1 && node->get_shape().size() == 4 && + valid_transpose_order(node->get_input_node_shared_ptr(1)) && is_supported_tensor(node->get_input_tensor(0)); +} + +auto tokenize_broadcast(const std::shared_ptr& interm_op, ov::NodeVector& ordered_ops) -> void { + // We can tokenize Broadcast op only when output shape of child doesn't depend on Broadcast shape without last dimension. + // Snippets remove Broadcast op and insert BroadcastMove if last dimensions before and after Broadcast are different. + // Otherwise, we can lose original shape. + // Example: + // in0 [1, 1, 1] in0 [1, 1, 1] in0 [1, 1, 1] in0 [1, 1, 1] + // Broadcast [1, 10, 1] / \ / + // \ / --->>> Add + // Add | + // Result [1, 10, 1] Result [1, 1, 1] + + ov::PartialShape new_output_shape(std::vector{1}); + ov::NodeVector broadcast_nodes; + + auto skip_last_dim = [](const ov::PartialShape& shape) { + return ov::PartialShape(std::vector{shape.begin(), shape.end() - 1}); + }; + + for (auto input : interm_op->inputs()) { + auto broadcast = ov::as_type_ptr(input.get_source_output().get_node_shared_ptr()); + // TODO: Can we reuse AppropriateForSubgraph here? 
Seems like it's huge check for Broadcast + if (broadcast && broadcast->get_broadcast_spec().m_type == ov::op::AutoBroadcastType::NUMPY && + broadcast->get_output_target_inputs(0).size() == 1) { + broadcast_nodes.push_back(broadcast); + + const auto pshape = broadcast->get_input_partial_shape(0); + if (pshape.rank().is_static() && pshape.size() > 2) { + ov::PartialShape::broadcast_merge_into(new_output_shape, + skip_last_dim(pshape), + ::ngraph::op::AutoBroadcastType::NUMPY); + } + } else { + const auto pshape = input.get_partial_shape(); + if (pshape.rank().is_static() && pshape.size() > 2) { + ov::PartialShape::broadcast_merge_into(new_output_shape, + skip_last_dim(pshape), + ::ngraph::op::AutoBroadcastType::NUMPY); + } + } + } + + if (!broadcast_nodes.empty()) { + if (new_output_shape == skip_last_dim(interm_op->get_output_partial_shape(0))) { + std::copy(broadcast_nodes.begin(), broadcast_nodes.end(), std::back_inserter(ordered_ops)); + } + } +} + +auto tokenize_reshape_around_softmax(std::shared_ptr& interm_op, + std::shared_ptr& reshape, + ngraph::NodeVector& ordered_ops) -> bool { + reshape = ngraph::as_type_ptr(interm_op); + if (reshape) { + const auto shape = reshape->get_input_shape(0); + if (shape.back() != reshape->get_output_shape(0).back() || reshape->get_output_target_inputs(0).size() != 1) + return false; + ordered_ops.push_back(reshape); + interm_op = reshape->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + } + return true; +}; + +auto update_intermediate_supported_ops(std::shared_ptr& interm_op, ngraph::NodeVector& ordered_ops) -> bool { + // TODO: Add Reshape, FQ support + while (is_supported_op(interm_op)) { + // All supported intermediate ops have only one output port + // To verify output element type is enough because all supported intermediate ops have the same output element type as input type + if (interm_op->get_output_target_inputs(0).size() != 1 || !is_supported_tensor(interm_op->get_output_tensor(0))) + return 
false; + + // Check for supported Broadcast op + if (interm_op->get_input_size() > 1) { + tokenize_broadcast(interm_op, ordered_ops); + } + + ordered_ops.push_back(interm_op); + interm_op = interm_op->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + } + return true; +}; +} // namespace + +ngraph::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets() { + MATCHER_SCOPE(TokenizeMHASnippets); + + auto m_matmul0 = std::make_shared(ngraph::pattern::any_input(ngraph::pattern::has_static_shape()), + ngraph::pattern::any_input(ngraph::pattern::has_static_shape())); + + register_matcher(std::make_shared(m_matmul0, matcher_name), + [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::TokenizeMHASnippets") + auto& pattern_to_output = m.get_pattern_value_map(); + + // After some transformations, a different number of Constants for some operations may be created + // than the actual number of Constants during tokenization. + // To avoid unsupported number of non-scalar Constants in the future (plugin specific limitation) + // we should calculate potential number of non-scalar Constants that will be moved up from body. 
+ // TODO: Need update this variable when FQ will be supported + size_t hidden_virtual_ports_count = 0; + // Default value is True because MHA pattern always requires Buffer op + bool need_buffer = true; + std::string fused_names; + ngraph::NodeVector ordered_ops; + + /* ======== Matcher Pass ========== */ + + /****** Skeleton ******/ + /* Skeleton on MHA-pattern is: + * \ / + * MatMul0 + * | + * Eltwise/Select/Reshape/FakeQuantize + * | + * Softmax + * | + * Eltwise/Select/Reshape/FakeQuantize + * \ / + * MatMul1 + */ + const auto matmul0 = ngraph::as_type_ptr(pattern_to_output.at(m_matmul0).get_node_shared_ptr()); + if (!matmul0 || matmul0->get_output_target_inputs(0).size() != 1 || matmul0->get_transpose_a() || + !is_supported_tensor(matmul0->get_input_tensor(0)) || !is_supported_tensor(matmul0->get_input_tensor(1))) + return false; + + if (transformation_callback(matmul0)) { + return false; + } + + ordered_ops.push_back(matmul0); + + auto interm_op = matmul0->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + // Add supported operations which are between MatMul0 and Softmax to ordered_ops + if (!update_intermediate_supported_ops(interm_op, ordered_ops)) + return false; + + std::shared_ptr reshape0 = nullptr; + if (!tokenize_reshape_around_softmax(interm_op, reshape0, ordered_ops)) + return false; + + int64_t axis = 0; + const auto rank = interm_op->get_input_partial_shape(0).rank(); + if (const auto softmax_v8 = ngraph::as_type_ptr(interm_op)) { + axis = ngraph::normalize_axis(interm_op->get_friendly_name(), softmax_v8->get_axis(), rank); + } else if (const auto softmax_v1 = ngraph::as_type_ptr(interm_op)) { + axis = softmax_v1->get_axis(); + } else { + return false; + } + + if (axis != rank.get_length() - 1 || interm_op->get_output_target_inputs(0).size() != 1) + return false; + ordered_ops.push_back(interm_op); + + interm_op = interm_op->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + std::shared_ptr reshape1 = 
nullptr; + if (!tokenize_reshape_around_softmax(interm_op, reshape1, ordered_ops)) + return false; + + if (((reshape0 == nullptr) != (reshape1 == nullptr)) || + (reshape0 && reshape1 && (reshape0->get_input_shape(0) != reshape1->get_output_shape(0)))) + return false; + + // Add supported operations which are between Softmax and MatMul1 to ordered_ops + if (!update_intermediate_supported_ops(interm_op, ordered_ops)) + return false; + + const auto matmul1 = ngraph::as_type_ptr(interm_op); + if (!matmul1 || matmul1->get_output_target_inputs(0).size() != 1 || matmul1->get_transpose_a() || matmul1->get_transpose_b() || + !is_supported_tensor(matmul1->get_input_tensor(0)) || !is_supported_tensor(matmul1->get_input_tensor(1))) + return false; + + /***********************/ + + /***** Transposes *****/ + /* There may be Transpose and Reshape ops on inputs and outputs of MHA-pattern skeleton + * We can add them into Subgraph body + */ + + // First input branch of MatMul0 should be executed before second input branch of MatMul0, + // so firstly we insert Transpose1 on the beginning of ordered_ops and then Transpose0 + bool are_weights_scalar = true; + auto parent = matmul0->get_input_node_shared_ptr(1); + while (is_supported_op(parent)) { + // All supported ops have only one output port + // To verify output element type is enough because all supported ops have the same output element type as input type + if (parent->get_output_target_inputs(0).size() != 1 || !is_supported_tensor(parent->get_output_tensor(0))) + break; + + const auto parent_count = parent->inputs().size(); + for (size_t i = 1; i < parent_count; ++i) { + are_weights_scalar = are_weights_scalar && ngraph::shape_size(parent->get_input_shape(i)) == 1; + } + ordered_ops.insert(ordered_ops.begin(), parent); + // We think that sequence of ops goes through input port 0 + // But can be Select here? If it can be, parent shouldn't be on input port 0. Need another way? 
+ parent = parent->get_input_node_shared_ptr(0); + } + + auto transpose1 = ngraph::as_type_ptr(parent); + if (matmul0->get_transpose_b()) { + if (is_valid_transpose(transpose1, {0, 2, 1, 3})) { + // We can support several ops between MatMul0 with transposed_b and Transpose1 with 0213 order + // only if these ops have scalar shapes on other inputs. + // There is transformation ExplicitTransposeMatMulInputs that set supported order and transposed_b(false). + // We can allow to call this pass only if ops have scalar shapes to avoid shape mismatching + if (are_weights_scalar) { + ordered_ops.insert(ordered_ops.begin(), transpose1); + } else { + return false; + } + } else { + return false; + } + } else { + if (is_valid_transpose(transpose1, {0, 2, 3, 1})) { + ordered_ops.insert(ordered_ops.begin(), transpose1); + } + } + + // TODO: Add Reshape Support for all Transposes + // Add 3D support for all Transposes + const auto transpose0 = ngraph::as_type_ptr(matmul0->get_input_node_shared_ptr(0)); + if (is_valid_transpose(transpose0, {0, 2, 1, 3})) { + ordered_ops.insert(ordered_ops.begin(), transpose0); + } else if (matmul0->get_transpose_b()) { + return false; + } + + const auto transpose2 = ngraph::as_type_ptr(matmul1->get_input_node_shared_ptr(1)); + if (is_valid_transpose(transpose2, {0, 2, 1, 3})) { + ordered_ops.push_back(transpose2); + } + ordered_ops.push_back(matmul1); + + auto child = matmul1->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + // TODO: Add support Eltwises between MatMul1 and Transpose + // status = update_intermediate_supported_ops(child, ordered_ops); + // if (!status) { + // ordered_ops.push_back(child); + // } + + auto transpose3 = ngraph::as_type_ptr(child); + if (is_valid_transpose(transpose3, {0, 2, 1, 3})) { + ordered_ops.push_back(transpose3); + } + + /**********************/ + + /* ================================ */ + + /* ====== Subgraph creation ======= */ + + ngraph::OutputVector body_inputs, subgraph_inputs; + 
ngraph::ParameterVector body_parameters; + ngraph::ResultVector body_results; + std::vector>> subgraph_result_inputs; + + auto create_body_inputs = [&](const std::shared_ptr& node) -> void { + for (size_t i = 0; i < node->get_input_size(); ++i) { + const auto input = node->input(i); + const auto parent = input.get_source_output().get_node_shared_ptr(); + const auto constant = ov::as_type_ptr(parent); + if (constant && (ngraph::shape_size(input.get_shape()) == 1 || op::Subgraph::constant_input_should_be_inside_body(node))) { + // If Constant has one consumer - target node, we add Constant to body_inputs + // If Constant has several consumers, we should check that all these consumers are inside Subgraph body + // and if all of them are inside body, we can explicitly add Constant to the body_inputs, otherwise we should + // make a copy and add copy of Constant to body_inputs + // For example, this case is especially valid for Transposes nodes + // (several Transposes have the same order so there can be the common Constant with this order) + if (constant->get_output_target_inputs(0).size() == 1) { + body_inputs.push_back(input.get_source_output()); + } else { + const auto constant_consumers = constant->get_output_target_inputs(0); + bool all_consumers_are_inside = std::all_of(constant_consumers.begin(), constant_consumers.end(), + [&ordered_ops](const ngraph::Input& input) { + return std::find(ordered_ops.begin(), ordered_ops.end(), + input.get_node()->shared_from_this()) != ordered_ops.end(); + }); + if (all_consumers_are_inside) { + body_inputs.push_back(input.get_source_output()); + } else { + const auto constant_copy = constant->clone_with_new_inputs({}); + node->set_argument(input.get_index(), constant_copy); + body_inputs.push_back(constant_copy); + } + } + } else if (std::find(ordered_ops.begin(), ordered_ops.end(), parent) == ordered_ops.end()) { + auto parameter = std::make_shared(input.get_element_type(), input.get_partial_shape()); + 
body_parameters.push_back(parameter); + body_parameters.back()->set_friendly_name(input.get_node()->get_friendly_name()); + body_inputs.push_back(parameter->output(0)); + + subgraph_inputs.push_back(input.get_source_output()); + + node->input(i).replace_source_output(parameter); + } + } + }; + + for (const auto& op : ordered_ops) { + create_body_inputs(op); + op->clear_control_dependencies(); + fused_names += op->get_friendly_name() + ","; + } + + const auto last_node = ordered_ops.back(); + for (const auto& output : last_node->outputs()) { + subgraph_result_inputs.push_back(output.get_target_inputs()); + } + for (const auto& output : last_node->outputs()) { + body_results.push_back(std::make_shared(last_node->output(output.get_index()))); + } + + if (body_results.size() != subgraph_result_inputs.size()) { + throw ngraph_error("body results and node results size mismatch during subgraph collapse"); + } + + // todo: move this plugin-specific constraint to the plugin callback + if (body_parameters.size() + body_results.size() + hidden_virtual_ports_count > 12) { + return false; + } + + auto body = op::create_body(last_node->get_friendly_name(), body_results, body_parameters); + auto subgraph = std::make_shared(subgraph_inputs, body); + // Copy runtime info from last node to subgraph - to copy topological order + copy_runtime_info(last_node, subgraph); + subgraph->set_friendly_name(last_node->get_friendly_name()); + + for (size_t i = 0; i < subgraph->get_output_size(); ++i) { + for (const auto& target_input : subgraph_result_inputs[i]) { + target_input.replace_source_output(subgraph->output(i)); + } + } + op::update_out_tensor_name(subgraph); + + subgraph->validate_and_infer_types(); + + auto act_body = subgraph->body_ptr(); + for (size_t i = 0; i < act_body->get_parameters().size(); i++) { + act_body->get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name()); + } + subgraph->get_rt_info()["originalLayersNames"] = fused_names; + 
subgraph->set_virtual_port_count(hidden_virtual_ports_count); + subgraph->set_buffer_needed(need_buffer); + + return true; + + /* ================================ */ + }); +} diff --git a/src/common/snippets/src/pass/reset_buffer.cpp b/src/common/snippets/src/pass/reset_buffer.cpp new file mode 100644 index 00000000000..bae2ac58ccd --- /dev/null +++ b/src/common/snippets/src/pass/reset_buffer.cpp @@ -0,0 +1,114 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include "snippets/snippets_isa.hpp" +#include "snippets/pass/reset_buffer.hpp" +#include "snippets/op/subgraph.hpp" + + +namespace { +void normalize_ptr_and_offsets(const ov::NodeVector &io, std::vector &ptr_increments, std::vector &finalization_offsets) { + bool there_is_buffer = false; + // Iterations are from end because before we correct finalization offsets for Loop outputs (io = inputs + outputs) + for (int i = static_cast(io.size()) - 1; i >= 0; --i) { + if (ov::is_type(io[i])) { + if (there_is_buffer) { + ptr_increments[i] = 0; + finalization_offsets[i] = 0; + } else { + there_is_buffer = true; + } + } + } +} +} // namespace + +int64_t ngraph::snippets::pass::ResetBufferState::calculate_required_finalization_offsets(const size_t back_step, const size_t target_work_amount) { + return target_work_amount != 1 ? 
-static_cast(back_step) : 0; +} + +ngraph::snippets::pass::ResetBufferState::ResetBufferState() { + MATCHER_SCOPE(ResetBufferState); + + // Match on LoopEnd is enough at the moment because Buffer op may be only after MatMul and LoopEnd, but + // MatMul doesn't change Buffer memory pointer after execution + auto m_loop_end = ngraph::pattern::wrap_type(); + + auto callback = [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ResetBufferState") + auto& pattern_to_output = m.get_pattern_value_map(); + + const auto loop_end = ngraph::as_type_ptr(pattern_to_output.at(m_loop_end).get_node_shared_ptr()); + const auto loop_begin = loop_end->get_loop_begin(); + + const auto i_size = loop_begin->get_input_size(); + const auto o_size = loop_end->get_output_size(); + const auto count_io = i_size + o_size; + std::vector body_shapes(count_io); + ov::NodeVector io(count_io); + for (size_t i = 0; i < i_size; ++i) { + body_shapes[i] = loop_begin->input_value(i).get_partial_shape(); + io[i] = loop_begin->input_value(i).get_node_shared_ptr(); + auto port_idx = loop_begin->input_value(i).get_index(); + while (std::dynamic_pointer_cast(io[i])) { + const auto source_output = io[i]->input_value(port_idx); + io[i] = source_output.get_node_shared_ptr(); + port_idx = source_output.get_index(); + } + } + for (size_t i = 0; i < o_size; ++i) { + body_shapes[i_size + i] = loop_end->output(i).get_partial_shape(); + // check for first target input is enough for Buffer searching because operations can have only single Buffer per each output port as op + auto consumer = *loop_end->output(i).get_target_inputs().begin(); + auto port_idx = consumer.get_index(); + io[i_size + i] = consumer.get_node()->shared_from_this(); + while (std::dynamic_pointer_cast(io[i_size + i])) { + auto consumer = *io[i_size + i]->output(port_idx).get_target_inputs().begin(); + port_idx = consumer.get_index(); + io[i_size + i] = 
consumer.get_node()->shared_from_this(); + } + } + + auto ptr_increments = loop_end->get_ptr_increments(); + auto finalization_offsets = loop_end->get_finalization_offsets(); + + // If after Loop there is immediately Buffer, we should reset the Buffer ptr for the next calculations + for (size_t i = 0; i < o_size; ++i) { + const auto result_shape = body_shapes[i_size + i].get_shape(); + // check for first target input is enough for Buffer searching because operations can have only single Buffer per each output port as op + const auto consumer = loop_end->output(i).get_target_inputs().begin()->get_node(); + if (ov::is_type(consumer)) { + // To calculate finalization offset we should know index of nesting Loop + auto loop_index = 0lu; + auto loop = loop_end->input_value(i).get_node_shared_ptr(); + auto port_idx = loop_end->input_value(i).get_index(); + while (std::dynamic_pointer_cast(loop)) { + const auto source_output = loop->input_value(port_idx); + loop = source_output.get_node_shared_ptr(); + port_idx = source_output.get_index(); + loop_index++; + } + + const auto work_amount = std::accumulate(result_shape.rbegin(), result_shape.rbegin() + loop_index + 1, size_t(1), std::multiplies()); + finalization_offsets[i_size + i] = + calculate_required_finalization_offsets(work_amount, *(result_shape.rbegin() + loop_index)); + } + } + + // If there are several Buffers on I/O we should remember that all Buffer have the register, + // so we should update ptr for only one Buffer + normalize_ptr_and_offsets(io, ptr_increments, finalization_offsets); + loop_end->set_finalization_offsets(finalization_offsets); + loop_end->set_ptr_increments(ptr_increments); + + return true; + }; + + auto m = std::make_shared(m_loop_end, matcher_name); + register_matcher(m, callback); +} diff --git a/src/common/snippets/src/pass/softmax_decomposition.cpp b/src/common/snippets/src/pass/softmax_decomposition.cpp new file mode 100644 index 00000000000..1a7330fb537 --- /dev/null +++ 
b/src/common/snippets/src/pass/softmax_decomposition.cpp @@ -0,0 +1,216 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/remarks.hpp" +#include + +#include "snippets/pass/softmax_decomposition.hpp" +#include "snippets/pass/reset_buffer.hpp" +#include "snippets/pass/insert_loops.hpp" +#include "snippets/pass/loop_helpers.hpp" +#include "snippets/snippets_isa.hpp" + +#include +#include +#include +#include +#include + + +ngraph::snippets::pass::SoftmaxDecomposition::SoftmaxDecomposition(const size_t vector_size, const int32_t buffer_allocation_rank) { + MATCHER_SCOPE(SoftmaxDecomposition); + + auto m_softmax = ngraph::pattern::wrap_type(); + + auto callback = [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SoftmaxDecomposition") + auto root = m.get_match_root(); + const auto master_pshape = root->get_input_partial_shape(0); + const auto rank = master_pshape.rank(); + if (rank.is_dynamic() || master_pshape.is_dynamic()) + return false; + + int64_t axis = 0; + if (const auto softmax_v8 = ngraph::as_type_ptr(root)) { + axis = ngraph::normalize_axis(root->get_friendly_name(), softmax_v8->get_axis(), rank); + } else if (const auto softmax_v1 = ngraph::as_type_ptr(root)) { + axis = softmax_v1->get_axis(); + } else { + return false; + } + + const auto shape_rank = rank.get_length(); + if (axis != shape_rank - 1) + return false; + + const auto data = root->get_input_node_shared_ptr(0); + + const auto master_shape = master_pshape.get_shape(); + const auto dimension = shape_rank - 1; + const auto work_amount = master_shape[dimension]; + const auto increment = vector_size; + const auto inner_dim = shape_rank - 1; + const auto inner_master_work_amount = static_cast(master_shape[inner_dim]); + const int outer_dim = shape_rank > 1 ? 
static_cast(shape_rank - 2) : -1; + const auto has_outer_loop = outer_dim >= 0 && master_shape[outer_dim] > 1; + + /* ====== ReduceMax decomposition ====== */ + + // We have to have fake edge Data -> Loop[ReduceMax] -> Loop[Sub + Exp + ReduceSum] because ReduceMax is + // accumulator which finds maximum of elements and save it to vector register. Loop works only with GPR (data) but ReduceMax Loop + // doesn't save maximum to data. Seems like, LoopEnd shouldn't have outputs: + // Data + // VectorBuffer LoopBegin \ + // \ Load \ | + // Maximum / | + // / LoopEnd | + // HorizonMax / + // \ LoopBegin[Sub + Exp + ReduceSum] + // But nGraph doesn't allow to have 0 outputs for Node (at least 1 output). + // Thus, we propagate data through Loop[ReduceMax] using fake edge because of that Loop[ReduceMax] has two inputs "Data" + // Data + // VectorBuffer LoopBegin + // \ Load | \ + // Maximum | / + // / LoopEnd + // HorizonMax | + // \ LoopBegin[Sub + Exp + ReduceSum] + const auto vector_buffer_max = std::make_shared(); + const auto loop_max_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{data, data}); + + const auto load_max = std::make_shared(loop_max_begin->output(0), increment); + const auto max = std::make_shared(load_max, vector_buffer_max); + + auto apply_increments_max = + InsertLoops::calculate_inner_apply_increments(master_shape, {data->get_shape(), data->get_shape(), data->get_shape()}); + // Input of softmax is Input and Output of this loop, which isn't used inside (it's just to have one output in Loop at least) + // So we shouldn't increment pointer after each loop iteration + apply_increments_max[0] = false; + apply_increments_max[1] = false; + // we should always reset data ptr after this loop because in the next Loop this ptr is used + // Although output isn't a Buffer op, we set finalization offset and ptr increment for output, because ResetBufferState pass + // normalizes offsets and increments starting from outputs + const auto 
finalization_offsets_max = + std::vector{ 0, 0, ResetBufferState::calculate_required_finalization_offsets(inner_master_work_amount, data->get_shape()[inner_dim]) }; + const auto loop_max_end = std::make_shared(ngraph::OutputVector{loop_max_begin->output(1), loop_max_begin->output(2)}, + work_amount, increment, apply_increments_max, finalization_offsets_max); + + const auto horizon_max = std::make_shared(max); + + /* =========================================== */ + + /* === Sub + Exp + ReduceSum decomposition === */ + + const auto vector_buffer_sum = std::make_shared(); + const auto loop_sum_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{loop_max_end->output(0)}); + + const auto load_sub = std::make_shared(loop_sum_begin->output(0), increment); + const auto sub = std::make_shared(load_sub, horizon_max); + const auto exp = std::make_shared(sub); + const auto sum = std::make_shared(exp, vector_buffer_sum); + const auto store_exp = std::make_shared(exp, increment); + + auto apply_increments_sum = + InsertLoops::calculate_inner_apply_increments(master_shape, {load_sub->get_shape(), store_exp->get_shape()}); + std::vector finalization_offsets_sum(2, 0); + if (has_outer_loop) { + finalization_offsets_sum = + InsertLoops::calculate_finalization_offsets(master_shape, {load_sub->get_shape(), store_exp->get_shape()}); + } + // we should always reset buffer ptr after loop because in the next Loop this buffer ptr is used + finalization_offsets_sum[1] = ResetBufferState::calculate_required_finalization_offsets(inner_master_work_amount, store_exp->get_shape()[inner_dim]); + const auto loop_sum_end = std::make_shared( + ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, work_amount, increment, + apply_increments_sum, finalization_offsets_sum); + + const auto horizon_sum = std::make_shared(sum); + const auto buffer_exp = std::make_shared(loop_sum_end->output(0), buffer_allocation_rank); + + /* =========================================== */ + + /* 
================== Div ==================== */ + + // Divide is expensive operation, so we decompose it into 1 / x * y, where 1 / x is executed outside loop + const auto pow = std::make_shared(horizon_sum, + ngraph::op::Constant::create(ov::element::f32, ngraph::Shape{}, {-1})); + + const auto loop_div_begin = op::insertLoopBegin(ngraph::OutputVector{buffer_exp}); + + const auto load_div = std::make_shared(loop_div_begin->output(0), increment); + const auto mul = std::make_shared(load_div, pow); + const auto store_div = std::make_shared(mul, increment); + + auto apply_increments_div = + InsertLoops::calculate_inner_apply_increments(master_shape, {load_div->get_shape(), store_div->get_shape()}); + std::vector finalization_offsets_div(2, 0); + if (has_outer_loop) { + finalization_offsets_div = + InsertLoops::calculate_finalization_offsets(master_shape, {load_div->get_shape(), store_div->get_shape()}); + } + const auto loop_div_end = std::make_shared( + ngraph::OutputVector{store_div, loop_div_begin->output(1)}, work_amount, increment, + apply_increments_div, finalization_offsets_div); + + /* =========================================== */ + + /* ========== Control dependency ============= */ + + loop_max_begin->add_control_dependency(vector_buffer_max); + loop_max_end->add_control_dependency(max); + horizon_max->add_control_dependency(loop_max_end); + loop_sum_begin->add_control_dependency(vector_buffer_sum); + loop_sum_begin->add_control_dependency(horizon_max); + loop_sum_end->add_control_dependency(sum); + horizon_sum->add_control_dependency(loop_sum_end); + loop_div_begin->add_control_dependency(horizon_sum); + loop_div_begin->add_control_dependency(pow); + + /* =========================================== */ + + /* ============= Runtime Info ================ */ + + // For tail loop we should fill input of Max by float min and + // input of Sum by zero to avoid math incorrect calculations + max->input(0).get_rt_info()["set_fill"] = uint32_t(0xff7fffff); + 
sum->input(0).get_rt_info()["set_fill"] = uint32_t(0x00000000); + + // These nodes should be executed outside loops + ov::NodeVector ops_outside_loop = { vector_buffer_max, horizon_max, vector_buffer_sum, horizon_sum, pow, buffer_exp }; + for (const auto& op : ops_outside_loop) { + op->get_rt_info()["outside_loop"] = true; + } + + ngraph::copy_runtime_info(root, + {vector_buffer_max, loop_max_begin, load_max, max, horizon_max, loop_max_end, + vector_buffer_sum, loop_sum_begin, load_sub, sub, exp, sum, store_exp, horizon_sum, loop_sum_end, buffer_exp, pow, + loop_div_begin, load_div, mul, store_div, loop_div_end}); + + /* =========================================== */ + + ngraph::replace_node(root, loop_div_end); + + /* ============== Outer loop ================= */ + if (has_outer_loop) { + std::vector apply_increments = + InsertLoops::calculate_outer_apply_increments({root->get_input_shape(0), root->get_output_shape(0)}); + const auto softmax_parameters = + std::vector>{loop_max_begin->input(0).get_source_output()}; + const auto output_set = loop_div_end->output(0).get_target_inputs(); + const auto softmax_results = std::vector>{output_set.begin(), output_set.end()}; + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(softmax_parameters); + const auto outer_loop_end = ngraph::snippets::op::insertLoopEndBeforeInputs( + softmax_results, outer_loop_begin, master_shape[outer_dim], 1, apply_increments); + + vector_buffer_max->add_control_dependency(outer_loop_begin); + + ngraph::copy_runtime_info(root, {outer_loop_begin, outer_loop_end}); + } + /* =========================================== */ + + return true; + }; + + auto m = std::make_shared(m_softmax, matcher_name); + register_matcher(m, callback); +} diff --git a/src/common/snippets/src/pass/softmax_reshape_elimination.cpp b/src/common/snippets/src/pass/softmax_reshape_elimination.cpp new file mode 100644 index 00000000000..f770f4e8066 --- /dev/null +++ 
b/src/common/snippets/src/pass/softmax_reshape_elimination.cpp @@ -0,0 +1,70 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/remarks.hpp" + +#include "snippets/pass/softmax_reshape_elimination.hpp" +#include "snippets/snippets_isa.hpp" + +#include +#include +#include + +ngraph::snippets::pass::SoftmaxReshapeElimination::SoftmaxReshapeElimination() { + MATCHER_SCOPE(SoftmaxReshapeElimination); + const auto m_reshape0 = pattern::wrap_type(pattern::has_static_shape()); + const auto m_softmax = pattern::wrap_type({m_reshape0}); + const auto m_reshape1 = pattern::wrap_type({m_softmax, pattern::wrap_type()}); + + register_matcher(std::make_shared(m_reshape1, matcher_name), + [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SoftmaxReshapeElimination") + auto& pattern_to_output = m.get_pattern_value_map(); + auto reshape0 = pattern_to_output[m_reshape0].get_node_shared_ptr(); + auto softmax = pattern_to_output[m_softmax].get_node_shared_ptr(); + auto reshape1 = pattern_to_output[m_reshape1].get_node_shared_ptr(); + + const auto input_shape = reshape0->get_input_partial_shape(0); + const auto output_shape = reshape1->get_output_partial_shape(0); + if (input_shape.is_dynamic() || output_shape.is_dynamic() || input_shape.get_shape() != output_shape.get_shape()) + return false; + + const auto softmax_rank = softmax->get_input_partial_shape(0).rank(); + int64_t axis = 0; + if (const auto softmax_v8 = ngraph::as_type_ptr(softmax)) { + axis = ngraph::normalize_axis(softmax->get_friendly_name(), softmax_v8->get_axis(), softmax_rank); + } else if (const auto softmax_v1 = ngraph::as_type_ptr(softmax)) { + axis = softmax_v1->get_axis(); + } else { + return false; + } + + // Supports only last axis + if (axis != softmax_rank.get_length() - 1) + return false; + + // Dimensions by reduction axis should be equal + if 
(input_shape.get_shape().back() != softmax->get_input_shape(0).back()) + return false; + + // Eliminate Reshape before Softmax + reshape0->output(0).replace(reshape0->input_value(0)); + copy_runtime_info({reshape0->input_value(0).get_node_shared_ptr(), reshape0->output(0).get_node_shared_ptr()}, + reshape0->input_value(0).get_node_shared_ptr()); + + // Eliminate Reshape after Softmax with name saving + replace_output_update_name(reshape1->output(0), reshape1->input_value(0)); + + // update axis + const auto new_axis = input_shape.rank().get_length() - 1; + if (auto softmax_v8 = ngraph::as_type_ptr(softmax)) { + softmax_v8->set_axis(new_axis); + } else if (auto softmax_v1 = ngraph::as_type_ptr(softmax)) { + softmax_v1->set_axis(new_axis); + } + + return true; + }); +} diff --git a/src/common/snippets/src/pass/tokenization.cpp b/src/common/snippets/src/pass/tokenization.cpp new file mode 100644 index 00000000000..4744b73b882 --- /dev/null +++ b/src/common/snippets/src/pass/tokenization.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/pass/tokenization.hpp" +#include "snippets/pass/common_optimizations.hpp" + + +namespace ngraph { +namespace snippets { +namespace pass { + +void SetSnippetsNodeType(const std::shared_ptr &node, SnippetsNodeType nodeType) { + auto &rt = node->get_rt_info(); + rt["SnippetsNodeType"] = nodeType; +} + +SnippetsNodeType GetSnippetsNodeType(const std::shared_ptr &node) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::GetSnippetsNodeType") + auto &rt = node->get_rt_info(); + const auto rinfo = rt.find("SnippetsNodeType"); + if (rinfo == rt.end()) + return SnippetsNodeType::NotSet; + return rinfo->second.as(); +} + +void SetTopologicalOrder(const std::shared_ptr &node, int64_t order) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::SetTopologicalOrder") + auto &rt = node->get_rt_info(); 
+ rt["TopologicalOrder"] = order; +} + +int64_t GetTopologicalOrder(const std::shared_ptr &node) { + auto &rt = node->get_rt_info(); + const auto rinfo = rt.find("TopologicalOrder"); + if (rinfo == rt.end()) + throw ngraph_error("Topological order is required, but not set."); + return rinfo->second.as(); +} + +bool EnumerateNodes::run_on_model(const std::shared_ptr &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::EnumerateNodes") + int64_t order = 0; + // Todo: We don't really have to set order for every node, just for subgraph parents and children would be enough + for (auto &node : m->get_ordered_ops()) { + SetTopologicalOrder(node, order++); + } + return true; +} + + +bool SnippetsTokenization::run_on_model(const std::shared_ptr& m) { + RUN_ON_FUNCTION_SCOPE(SnippetsTokenization); + ngraph::pass::Manager manager(get_pass_config()); + manager.set_per_pass_validation(false); + + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.run_passes(m); + + // Returning value is false because pass::Manager always apply Validation pass if function was changed. 
+ // But we don't need to validate the model + return false; +} + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/src/pass/transpose_decomposition.cpp b/src/common/snippets/src/pass/transpose_decomposition.cpp new file mode 100644 index 00000000000..5dc6960b2fd --- /dev/null +++ b/src/common/snippets/src/pass/transpose_decomposition.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +const std::set> ngraph::snippets::pass::TransposeDecomposition::supported_cases = {{0, 2, 3, 1}}; +ngraph::snippets::pass::TransposeDecomposition::TransposeDecomposition() { + MATCHER_SCOPE(TransposeDecomposition); + // todo: we need a special transformation that detects and propagates data access pattern to Parameters and Results + // this is needed to communicate access pattern to the plugin node and op::Kernel + // This is the reason we match only to Parameter, this limitation could be relaxed if we propagate access pattern + // to the appropriate parameter + auto match_data = ngraph::pattern::wrap_type(); + auto match_order = ngraph::pattern::wrap_type(); + auto match_transpose = ngraph::pattern::wrap_type({match_data, match_order}); + + ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::TransposeDecomposition") + auto& pattern_to_output = m.get_pattern_value_map(); + const auto transpose = ov::as_type_ptr( + pattern_to_output.at(match_transpose).get_node_shared_ptr()); + + const auto order = ov::as_type_ptr(pattern_to_output.at(match_order).get_node_shared_ptr()); + if (transformation_callback(transpose) || transpose->is_dynamic()) + return false; + + auto order_value = order->cast_vector(); + if (supported_cases.count(order_value) == 0) + return 
false; + + auto data_input = pattern_to_output.at(match_data); + const auto& data_node = pattern_to_output.at(match_data).get_node_shared_ptr(); + auto ¶m_rt = data_node->get_rt_info(); + // Note: store and usage inside emitters as size_t is more convenient, so static_cast here + const auto& access_pattern = order->cast_vector(); + param_rt["Layout"] = access_pattern; + + // The line below is Ok, since we ensured that transpose is static above + auto data_shape = data_input.get_shape(); + // dim indexes with respect to SRC + const auto dim_C_idx = data_shape.size() - 3; + const auto dim_H_idx = data_shape.size() - 2; + const auto dim_W_idx = data_shape.size() - 1; + const auto size_C = static_cast(data_shape[dim_C_idx]); + const auto size_W = static_cast(data_shape[dim_W_idx]); + const auto size_H = static_cast(data_shape[dim_H_idx]); + + auto loop_W_begin = std::make_shared(OutputVector{data_input}); + auto loop_C_begin = std::make_shared(OutputVector{loop_W_begin->output(0)}); + // todo: LoadReshape used here is essentially Load + an easy way to maintain correct shape propagation + // fix this in future and develop a more consistent shape propagation approach. 
+ auto load = std::make_shared(loop_C_begin->output(0), 1, 0, access_pattern); + auto store = std::make_shared(load, 1); + const std::vector ptr_increments_C {size_H * size_W, 1}; + const std::vector finalization_offsets_C {1 - size_H * size_W * size_C, 0}; + auto loop_C_end = std::make_shared(OutputVector{store->output(0), loop_C_begin->output(1)}, + size_C, 1, ptr_increments_C, finalization_offsets_C); + auto loop_W_end = std::make_shared(OutputVector{loop_C_end->output(0), loop_W_begin->output(1)}, + size_W, 1, std::vector{0, 0}, std::vector{0, 0}); + + for (auto& input : transpose->output(0).get_target_inputs()) { + input.replace_source_output(loop_W_end->output(0)); + } + + return true; + }; + + auto m = std::make_shared(match_transpose, matcher_name); + register_matcher(m, callback); +} diff --git a/src/common/snippets/src/utils.cpp b/src/common/snippets/src/utils.cpp index 9c38a17b2d5..11adf0fe954 100644 --- a/src/common/snippets/src/utils.cpp +++ b/src/common/snippets/src/utils.cpp @@ -6,8 +6,11 @@ #include "snippets/pass/fq_decomposition.hpp" +namespace ngraph { +namespace snippets { +namespace utils { -auto ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(const std::shared_ptr& fq) -> size_t { +auto get_non_scalar_constant_count_for_fq(const std::shared_ptr& fq) -> size_t { std::vector out_scales; std::vector cl, ch, isc, ish, osc, osh; const bool status = ngraph::snippets::pass::FakeQuantizeDecomposition::getScalesAndShifts(fq, cl, ch, isc, ish, osc, osh); @@ -55,3 +58,54 @@ auto ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(const std::sh return 1; return 0; } +std::vector get_node_output_layout(const std::shared_ptr& node) { + return get_node_output_layout(node.get()); +} +std::vector get_node_output_layout(const Node* node) { + if (!node) + return {}; + if (node->is_dynamic()) + throw ngraph_error("It's illegal to call get_node_output_layout for dynamic nodes"); + auto &rt = node->get_rt_info(); + const auto rinfo = 
rt.find("Layout"); + if (rinfo != rt.end()) { + std::vector layout(rinfo->second.as>()); + // This might be a little costy, but still useful sanity check. Remove if proved to be unacceptably heavy. + std::set unique_elements(layout.begin(), layout.end()); + if (unique_elements.size() < layout.size()) + throw ngraph_error("Layout must contain only unique dimension indexes"); + return layout; + } else { + return {}; + } +} + +ov::PartialShape get_reordered_planar_shape(const ov::PartialShape& shape, const std::vector& layout) { + if (layout.empty()) + return shape; + std::vector reordered_shape(layout.size()); + if (shape.rank().is_dynamic()) + throw ngraph_error("get_reordered_planar_shape can't be called for outputs with dynamic rank"); + const size_t rank = shape.rank().get_length(); + if (layout.size() > rank) + throw ngraph_error("Layout rank can't be larger than tensor rank"); + // Note that it can be smaller though, for example tensor shape can be prepended with 1 for scheduling purposes + if (std::any_of(layout.begin(), layout.end(), [=](size_t x) {return x >= rank;})) + throw ngraph_error("Invalid layout detected: all layout indexes must be smaller than the tensor rank"); + for (int i = 0; i < layout.size(); i++) + reordered_shape[i] = shape[layout[i]]; + return reordered_shape; +} + +ov::PartialShape get_port_planar_shape(const Output& out) { + std::vector layout = get_node_output_layout(out.get_node_shared_ptr()); + const auto& tensor = out.get_tensor_ptr(); + if (!tensor) + throw ngraph_error("get_port_planar_shape can't be called for an uninitialized output tensor"); + auto tensor_shape = tensor->get_partial_shape(); + return get_reordered_planar_shape(tensor_shape, layout); +} + +} // namespace utils +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/tests/include/lowering_utils.hpp b/src/common/snippets/tests/include/lowering_utils.hpp index 8e86d321e7e..c629b1c13f5 100644 --- 
a/src/common/snippets/tests/include/lowering_utils.hpp +++ b/src/common/snippets/tests/include/lowering_utils.hpp @@ -40,10 +40,16 @@ public: class LoweringTests : public TransformationTestsF { public: LoweringTests(); + + void SetUp() override; + void TearDown() override; + protected: static std::shared_ptr getSubgraph(const std::shared_ptr& f); - static std::shared_ptr getLoweredSubgraph(const std::shared_ptr& f); + static std::shared_ptr getLoweredSubgraph(const std::shared_ptr& f, + const ov::PartialShape& master_shape); static std::shared_ptr getTokenizedSubgraph(const std::shared_ptr& f); + ov::PartialShape master_shape{}; }; } // namespace snippets diff --git a/src/common/snippets/tests/include/pass/broadcast_to_movebroadcast.hpp b/src/common/snippets/tests/include/pass/broadcast_to_movebroadcast.hpp new file mode 100644 index 00000000000..15a1f5a9846 --- /dev/null +++ b/src/common/snippets/tests/include/pass/broadcast_to_movebroadcast.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "lowering_utils.hpp" +#include "snippets_helpers.hpp" + +namespace ov { +namespace test { +namespace snippets { +typedef std::tuple< + Shape, // Input shape 0 + Shape, // Input shape 1 + Shape // Broadcast shape +> BroadcastParams; + +class BroadcastToMoveBroadcastTests : public LoweringTests, public testing::WithParamInterface { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; + std::shared_ptr snippets_function; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/common/snippets/tests/include/pass/fuse_transpose_brgemm.hpp b/src/common/snippets/tests/include/pass/fuse_transpose_brgemm.hpp new file mode 100644 index 00000000000..8b886ef9876 --- /dev/null +++ b/src/common/snippets/tests/include/pass/fuse_transpose_brgemm.hpp @@ -0,0 +1,33 @@ +// Copyright (C) 2022 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "lowering_utils.hpp" +#include "snippets_helpers.hpp" + +/* The main purpose is to test that FuseTransposeBrgemm properly fuses 0213 Transposes on both inputs, as well as on output + */ + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + std::vector, // Input shapes + PartialShape, // Master shape + size_t // Transpose position +> fuseTransposeBrgemmParams; + +class FuseTransposeBrgemmTests : public LoweringTests, public testing::WithParamInterface { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; + std::shared_ptr snippets_function; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/common/snippets/tests/include/pass/mha_tokenization.hpp b/src/common/snippets/tests/include/pass/mha_tokenization.hpp new file mode 100644 index 00000000000..60e06d591ca --- /dev/null +++ b/src/common/snippets/tests/include/pass/mha_tokenization.hpp @@ -0,0 +1,20 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ov { +namespace test { +namespace snippets { + +class TokenizeMHASnippetsTests : public TransformationTestsF { +public: + virtual void run(); +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/common/snippets/tests/include/pass/softmax_decomposition.hpp b/src/common/snippets/tests/include/pass/softmax_decomposition.hpp new file mode 100644 index 00000000000..3943bd641bf --- /dev/null +++ b/src/common/snippets/tests/include/pass/softmax_decomposition.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "lowering_utils.hpp" +#include "snippets_helpers.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + Shape, // Input shape 0 + int // Axis +> 
SoftmaxParams; + +typedef std::tuple< + Shape, // Input shape 0 + Shape, // Input shape 1 + int // Axis +> AddSoftmaxParams; + +class SoftmaxTests : public LoweringTests, public testing::WithParamInterface { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; + std::shared_ptr snippets_function; +}; + +class AddSoftmaxTests : public LoweringTests, public testing::WithParamInterface { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; + std::shared_ptr snippets_function; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/common/snippets/tests/src/lowering_utils.cpp b/src/common/snippets/tests/src/lowering_utils.cpp index de46de861ca..a07fb4c0884 100644 --- a/src/common/snippets/tests/src/lowering_utils.cpp +++ b/src/common/snippets/tests/src/lowering_utils.cpp @@ -4,7 +4,7 @@ #include #include "lowering_utils.hpp" -#include "snippets/pass/collapse_subgraph.hpp" +#include "snippets/pass/tokenization.hpp" namespace ov { @@ -21,7 +21,12 @@ DummyTargetMachine::DummyTargetMachine() { jitters[op::v1::Add::get_type_info_static()] = dummy_functor; jitters[op::v1::Subtract::get_type_info_static()] = dummy_functor; jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor; - jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor; + jitters[op::v1::Divide::get_type_info_static()] = dummy_functor; + jitters[op::v1::Maximum::get_type_info_static()] = dummy_functor; + jitters[op::v0::Exp::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::PowerStatic::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::HorizonMax::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::HorizonSum::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Load::get_type_info_static()] = dummy_functor; 
jitters[ngraph::snippets::op::BroadcastLoad::get_type_info_static()] = dummy_functor; @@ -30,8 +35,12 @@ DummyTargetMachine::DummyTargetMachine() { jitters[ngraph::snippets::op::Scalar::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = dummy_functor; - jitters[ngraph::snippets::op::Tile::get_type_info_static()] = dummy_functor; - jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::LoopBegin::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::LoopEnd::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::Brgemm::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::Buffer::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::VectorBuffer::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::Fill::get_type_info_static()] = dummy_functor; } LoweringTests::LoweringTests() : TransformationTestsF() { @@ -41,6 +50,29 @@ LoweringTests::LoweringTests() : TransformationTestsF() { comparator.disable(FunctionsComparator::CmpValues::SUBGRAPH_DESCRIPTORS); } +void LoweringTests::SetUp() { + manager.register_pass(); +} + +void LoweringTests::TearDown() { + auto cloned_function = ngraph::clone_function(*function); + if (!function_ref) { + function_ref = cloned_function; + } + manager.run_passes(function); + ASSERT_NO_THROW(check_rt_info(function)); + + if (comparator.should_compare(FunctionsComparator::ACCURACY)) { + auto acc_comparator = FunctionsComparator::no_default(); + acc_comparator.enable(FunctionsComparator::CmpValues::ACCURACY); + auto res = acc_comparator.compare(function, cloned_function); + ASSERT_TRUE(res.valid) << res.message; + comparator.disable(FunctionsComparator::CmpValues::ACCURACY); + } + auto res = comparator.compare(function, function_ref); + 
ASSERT_TRUE(res.valid) << res.message; +} + std::shared_ptr LoweringTests::getSubgraph(const std::shared_ptr& f) { std::shared_ptr subgraph; for (const auto &op : f->get_ops()) { @@ -59,9 +91,30 @@ std::shared_ptr LoweringTests::getSubgraph(const return subgraph; } -std::shared_ptr LoweringTests::getLoweredSubgraph(const std::shared_ptr &f) { +std::shared_ptr LoweringTests::getLoweredSubgraph(const std::shared_ptr &f, + const ov::PartialShape& master_shape) { auto subgraph = getTokenizedSubgraph(f); subgraph->set_generator(std::make_shared()); + subgraph->set_master_shape(master_shape); + const auto& body = subgraph->body_ptr(); + auto& body_rt_info = body->get_rt_info(); + // todo: insertLoops pass requires body_rt_info["PluginShapesOverride"] and subgraph->set_tile_rank to work normally + // consider revising snippets-plugin shape and scheduling communication + std::vector> new_shapes; + for (const auto& p : body->get_parameters()) { + const auto pshape = p->get_output_partial_shape(0); + if (pshape.is_dynamic()) + IE_THROW() << "getLoweredSubgraph supports only static shapes"; + new_shapes.push_back(pshape.get_shape()); + } + for (const auto& r : body->get_results()) { + const auto pshape = r->get_input_partial_shape(0); + if (pshape.is_dynamic()) + IE_THROW() << "getLoweredSubgraph supports only static shapes"; + new_shapes.push_back(pshape.get_shape()); + } + body_rt_info["PluginShapesOverride"] = new_shapes; + subgraph->set_tile_rank(2); subgraph->generate(); return subgraph; } diff --git a/src/common/snippets/tests/src/pass/broadcast_to_movebroadcast.cpp b/src/common/snippets/tests/src/pass/broadcast_to_movebroadcast.cpp new file mode 100644 index 00000000000..eec9fddf0f4 --- /dev/null +++ b/src/common/snippets/tests/src/pass/broadcast_to_movebroadcast.cpp @@ -0,0 +1,59 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "pass/broadcast_to_movebroadcast.hpp" +#include 
"common_test_utils/common_utils.hpp" +#include + +namespace ov { +namespace test { +namespace snippets { + + +std::string BroadcastToMoveBroadcastTests::getTestCaseName(testing::TestParamInfo obj) { + std::vector inputShapes(2); + Shape broadcast_shape; + std::tie(inputShapes[0], inputShapes[1], broadcast_shape) = obj.param; + std::ostringstream result; + for (size_t i = 0; i < inputShapes.size(); i++) + result << "IS[" << i << "]=" << CommonTestUtils::vec2str(inputShapes[i]) << "_"; + result << "BS=" << CommonTestUtils::vec2str(broadcast_shape) << "_"; + return result.str(); +} + +void BroadcastToMoveBroadcastTests::SetUp() { + TransformationTestsF::SetUp(); + std::vector inputShapes(2); + PartialShape broadcast_shape; + std::tie(inputShapes[0], inputShapes[1], broadcast_shape) = this->GetParam(); + snippets_function = std::make_shared(inputShapes, broadcast_shape); + master_shape = {}; + for (int i = 0; i < inputShapes[0].size(); i++) + master_shape.push_back(static_cast(std::max(inputShapes[0].get_shape()[i], inputShapes[1].get_shape()[i]))); +} + +TEST_P(BroadcastToMoveBroadcastTests, BroadcastSelect) { + PartialShape scheduler_shape({master_shape[master_shape.size() - 2], + master_shape[master_shape.size() - 1]}); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape); + function = subgraph->body_ptr(); + function_ref = snippets_function->getLowered(); +} + +namespace BroadcastToMoveBroadcastTestsInstantiation { +using ov::Shape; +std::vector inputShapes0 {{1, 8, 2, 10}, {1, 8, 2, 1}, {1, 1, 1, 1}}; +std::vector inputShapes1 {{1, 8, 2, 10}, {1, 8, 2, 1}, {1, 1, 1, 1}}; +Shape broadcastShape {1, 8, 2, 10}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Broadcast, BroadcastToMoveBroadcastTests, + ::testing::Combine( + ::testing::ValuesIn(inputShapes0), + ::testing::ValuesIn(inputShapes1), + ::testing::Values(broadcastShape)), + BroadcastToMoveBroadcastTests::getTestCaseName); +} // namespace BroadcastToMoveBroadcastTestsInstantiation 
+} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/tests/src/pass/canonicalization.cpp b/src/common/snippets/tests/src/pass/canonicalization.cpp index 15c33e6df96..7b687bad226 100644 --- a/src/common/snippets/tests/src/pass/canonicalization.cpp +++ b/src/common/snippets/tests/src/pass/canonicalization.cpp @@ -23,12 +23,12 @@ std::string CanonicalizationTests::getTestCaseName(testing::TestParamInfoGetParam(); input_blocked_shapes = {std::get<1>(inputs[0]), std::get<1>(inputs[1])}; - snippets_function = std::make_shared(std::vector{std::get<0>(inputs[0]), std::get<0>(inputs[1])}); + snippets_function = std::make_shared(std::vector{std::get<0>(inputs[0]), std::get<0>(inputs[1])}); } TEST_P(CanonicalizationTests, Add) { @@ -50,8 +50,9 @@ TEST_P(CanonicalizationTests, Add) { function_ref = snippets_function->getReference(); auto subgraph = getTokenizedSubgraph(function); subgraph->set_generator(std::make_shared()); - Shape canonical_output_shape = subgraph->canonicalize(output_blocked_shapes, input_blocked_shapes); - ASSERT_DIMS_EQ(canonical_output_shape, expected_output_shape); + auto canonical_output_shape = subgraph->canonicalize(output_blocked_shapes, input_blocked_shapes); + ASSERT_TRUE(canonical_output_shape.is_static()); + ASSERT_DIMS_EQ(canonical_output_shape.get_shape(), expected_output_shape); } namespace CanonicalizationTestsInstantiation { diff --git a/src/common/snippets/tests/src/pass/collapse_subgraph.cpp b/src/common/snippets/tests/src/pass/collapse_subgraph.cpp index f5be10838d6..cc4394c5ad3 100644 --- a/src/common/snippets/tests/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/tests/src/pass/collapse_subgraph.cpp @@ -6,7 +6,7 @@ #include #include #include -#include "snippets/pass/collapse_subgraph.hpp" +#include "snippets/pass/tokenization.hpp" namespace ov { namespace test { @@ -17,59 +17,64 @@ void CollapseSubgraphTests::run() { std::string name; 
manager.register_pass(); manager.register_pass(); + // todo: This is a temporary work-around. remove when MatMul tokenization is supported through general pipeline + manager.get_pass_config()->set_callback( + [](const std::shared_ptr& n) -> bool { + return ov::is_type(n); + }); } TEST_F(CollapseSubgraphTests, smoke_Snippets_Eltwise) { - const auto &f = EltwiseFunction(std::vector {{2, 3}, {1, 3}}); + const auto &f = EltwiseFunction(std::vector {{2, 3}, {1, 3}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_MatMulWithEltwise) { - const auto &f = MatMulEltwiseBranchesFunction(std::vector {{1, 3, 4, 4}, {1, 3, 4, 4}}); + const auto &f = MatMulEltwiseBranchesFunction(std::vector {{1, 3, 4, 4}, {1, 3, 4, 4}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_AvoidLoopEltwise) { - const auto &f = EltwiseLogLoopFunction(std::vector {{2, 5}, {2, 1}}); + const auto &f = EltwiseLogLoopFunction(std::vector {{2, 5}, {2, 1}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_OneConvert) { - const auto &f = ConvertFunction(std::vector{{2, 5}}); + const auto &f = ConvertFunction(std::vector{{2, 5}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertInput) { - const auto &f = ConvertInputFunction(std::vector{{2, 5}, {1, 5}}); + const auto &f = ConvertInputFunction(std::vector{{2, 5}, {1, 5}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertOutput) { - const auto &f = ConvertOutputFunction(std::vector{{2, 5}, {1, 5}}); + const auto &f = ConvertOutputFunction(std::vector{{2, 5}, {1, 5}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertStub) { - const auto &f = 
ConvertStubFunction(std::vector{{2, 5, 2}, {1, 5, 1}}); + const auto &f = ConvertStubFunction(std::vector{{2, 5, 2}, {1, 5, 1}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertPartialInputsAndResults) { - const auto &f = ConvertPartialInputsAndResultsFunction(std::vector{{2, 5, 1}, {1, 5, 1}, {2, 1, 10}}, + const auto &f = ConvertPartialInputsAndResultsFunction(std::vector{{2, 5, 1}, {1, 5, 1}, {2, 1, 10}}, std::vector{ov::element::i8, ov::element::bf16, ov::element::f32}, std::vector{ov::element::f32, ov::element::i8}); function = f.getOriginal(); @@ -78,7 +83,7 @@ TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertPartialInputsAndResults) { } TEST_F(CollapseSubgraphTests, smoke_Snippets_EltwiseTwoResultsFunction) { - const auto &f = EltwiseTwoResultsFunction(std::vector{{2, 5}, {2, 1}}); + const auto &f = EltwiseTwoResultsFunction(std::vector{{2, 5}, {2, 1}}); function = f.getOriginal(); function_ref = f.getReference(); comparator.enable(FunctionsComparator::CmpValues::NAMES); diff --git a/src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp b/src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp new file mode 100644 index 00000000000..22936ca0c62 --- /dev/null +++ b/src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp @@ -0,0 +1,58 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "pass/fuse_transpose_brgemm.hpp" +#include "common_test_utils/common_utils.hpp" +#include "subgraph_matmul.hpp" +#include "subgraph_lowered.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string FuseTransposeBrgemmTests::getTestCaseName(testing::TestParamInfo obj) { + std::vector input_shapes(2); + PartialShape master_shape; + size_t transpose_position; + std::tie(input_shapes, master_shape, transpose_position) = obj.param; + std::ostringstream result; + result << "IS[0]=" << 
CommonTestUtils::partialShape2str({input_shapes[0]}) << "_"; + result << "IS[1]=" << CommonTestUtils::partialShape2str({input_shapes[1]}) << "_"; + result << "MS=" << CommonTestUtils::partialShape2str({master_shape}) << "_"; + result << "Pos=" << transpose_position << "_"; + return result.str(); +} + +void FuseTransposeBrgemmTests::SetUp() { + LoweringTests::SetUp(); + std::vector input_shapes(2); + size_t transpose_position; + std::tie(input_shapes, master_shape, transpose_position) = this->GetParam(); + + snippets_function = std::make_shared(input_shapes, transpose_position); +} + +TEST_P(FuseTransposeBrgemmTests, FuseTransposeMatmul) { + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), master_shape); + function = subgraph->body_ptr(); + function_ref = snippets_function->getLowered(); +} + +namespace FuseTransposeBrgemmTestsInstantiation { +using ov::Shape; +std::vector test_params{ + {{{1, 49, 2, 23}, {2, 2, 23, 39}}, {2, 2, 49, 23}, 0}, + {{{1, 2, 49, 23}, {2, 23, 1, 39}}, {2, 2, 49, 39}, 1}, + {{{1, 2, 49, 23}, {2, 2, 23, 39}}, {2, 2, 49, 39}, 2}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_FuseTransposeMatMul, FuseTransposeBrgemmTests, + ::testing::ValuesIn(test_params), + FuseTransposeBrgemmTests::getTestCaseName); + +} // namespace FuseTransposeBrgemmTestsInstantiation +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/tests/src/pass/insert_load_store.cpp b/src/common/snippets/tests/src/pass/insert_load_store.cpp index 1a2fa5a75fc..929697852cb 100644 --- a/src/common/snippets/tests/src/pass/insert_load_store.cpp +++ b/src/common/snippets/tests/src/pass/insert_load_store.cpp @@ -25,16 +25,18 @@ std::string InsertLoadStoreTests::getTestCaseName(testing::TestParamInfo inputShapes(3); std::vector broadcastShapes(3); std::tie(inputShapes[0], inputShapes[1], inputShapes[2], broadcastShapes[0], broadcastShapes[1], broadcastShapes[2]) = this->GetParam(); - 
snippets_function = std::make_shared(inputShapes, broadcastShapes); + snippets_function = std::make_shared( + std::vector {inputShapes[0], inputShapes[1], inputShapes[2]}, broadcastShapes); + master_shape = inputShapes[0]; } TEST_P(InsertLoadStoreTests, ThreeInputsEltwise) { - auto subgraph = getLoweredSubgraph(snippets_function->getOriginal()); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), master_shape); function = subgraph->body_ptr(); function_ref = snippets_function->getLowered(); } diff --git a/src/common/snippets/tests/src/pass/insert_movebroadcast.cpp b/src/common/snippets/tests/src/pass/insert_movebroadcast.cpp index f97b8019239..669cb34bc30 100644 --- a/src/common/snippets/tests/src/pass/insert_movebroadcast.cpp +++ b/src/common/snippets/tests/src/pass/insert_movebroadcast.cpp @@ -24,15 +24,22 @@ std::string InsertMoveBroadcastTests::getTestCaseName(testing::TestParamInfo inputShapes(2); std::vector broadcastShapes(2); std::tie(inputShapes[0], inputShapes[1], broadcastShapes[0], broadcastShapes[1]) = this->GetParam(); - snippets_function = std::make_shared(inputShapes, broadcastShapes); + snippets_function = std::make_shared(std::vector {inputShapes[0], inputShapes[1]}, broadcastShapes); + if (inputShapes[0].size() != inputShapes[1].size()) + IE_THROW() << "Expected input shapes of the same size"; + master_shape = {}; + for (int i = 0; i < inputShapes[0].size(); i++) + master_shape.push_back(static_cast(std::max(inputShapes[0][i], inputShapes[1][i]))); } TEST_P(InsertMoveBroadcastTests, AddBroadcast) { - auto subgraph = getLoweredSubgraph(snippets_function->getOriginal()); + PartialShape scheduler_shape({master_shape[master_shape.size() - 2], + master_shape[master_shape.size() - 1]}); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape); function = subgraph->body_ptr(); function_ref = snippets_function->getLowered(); } diff --git a/src/common/snippets/tests/src/pass/merge_loops.cpp 
b/src/common/snippets/tests/src/pass/merge_loops.cpp new file mode 100644 index 00000000000..be398f2107f --- /dev/null +++ b/src/common/snippets/tests/src/pass/merge_loops.cpp @@ -0,0 +1,169 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include + +#include +#include + +#include + +#include "common_test_utils/ngraph_test_utils.hpp" + +using namespace testing; +using namespace ngraph; + +TEST(TransformationTests, UnaryEltwisesLoops) { + std::shared_ptr f(nullptr), f_ref(nullptr); + auto shape = Shape{2, 3, 240}; + const size_t vector_size = 16; + const std::vector inner_ptr_increments(2, vector_size); + const std::vector inner_finalization_offsets(2, 0); + { + auto data = std::make_shared(element::f32, shape); + + auto outer_loop_begin_up = std::make_shared(OutputVector{data}); + auto inner_loop_begin_up = std::make_shared(OutputVector{outer_loop_begin_up}); + auto load_up = std::make_shared(inner_loop_begin_up->output(0)); + auto relu = std::make_shared(load_up); + auto store_up = std::make_shared(relu); + auto inner_loop_end_up = std::make_shared( + OutputVector{store_up, inner_loop_begin_up->output(1)}, shape[shape.size() - 1], vector_size, + inner_ptr_increments, inner_finalization_offsets); + auto outer_loop_end_up = std::make_shared( + OutputVector{inner_loop_end_up->output(0), outer_loop_begin_up->output(1)}, shape[shape.size() - 2], 1, + std::vector{0, 0}, std::vector{0, 0}); + + auto buffer = std::make_shared(outer_loop_end_up); + + auto outer_loop_begin_down = std::make_shared(OutputVector{buffer}); + auto inner_loop_begin_down = std::make_shared(OutputVector{outer_loop_begin_down}); + auto load_down = std::make_shared(inner_loop_begin_down->output(0)); + auto hswish = std::make_shared(load_down); + auto store_down = std::make_shared(hswish); + auto inner_loop_end_down = std::make_shared( + OutputVector{store_down, inner_loop_begin_down->output(1)}, shape[shape.size() - 1], 
vector_size, + inner_ptr_increments, inner_finalization_offsets); + auto outer_loop_end_down = std::make_shared( + OutputVector{inner_loop_end_down->output(0), outer_loop_begin_down->output(1)}, shape[shape.size() - 2], 1, + std::vector{0, 0}, std::vector{0, 0}); + + f = std::make_shared(OutputVector{outer_loop_end_down->output(0)}, ParameterVector{data}); + + pass::Manager m; + m.register_pass(); + m.register_pass(); + m.run_passes(f); + } + { + auto data = std::make_shared(element::f32, shape); + + auto outer_loop_begin = std::make_shared(OutputVector{data}); + auto inner_loop_begin = std::make_shared(OutputVector{outer_loop_begin}); + auto load = std::make_shared(inner_loop_begin->output(0)); + auto relu = std::make_shared(load); + auto hswish = std::make_shared(relu); + auto store = std::make_shared(hswish); + auto inner_loop_end = std::make_shared( + OutputVector{store, inner_loop_begin->output(1)}, shape[shape.size() - 1], vector_size, + inner_ptr_increments, inner_finalization_offsets); + auto outer_loop_end = std::make_shared( + OutputVector{inner_loop_end->output(0), outer_loop_begin->output(1)}, shape[shape.size() - 2], 1, + std::vector{0, 0}, std::vector{0, 0}); + + f_ref = std::make_shared(OutputVector{outer_loop_end->output(0)}, ParameterVector{data}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} + +TEST(TransformationTests, BinaryEltwisesLoops) { + std::shared_ptr f(nullptr), f_ref(nullptr); + auto shape = Shape{2, 3, 240}; + const size_t vector_size = 16; + { + const std::vector inner_ptr_increments(3, vector_size); + const std::vector inner_finalization_offsets(3, 0); + + auto data0 = std::make_shared(element::f32, shape); + auto data1 = std::make_shared(element::f32, shape); + + auto outer_loop_begin_up = std::make_shared(OutputVector{data0, data1}); + auto inner_loop_begin_up = std::make_shared(OutputVector{outer_loop_begin_up->output(0), + outer_loop_begin_up->output(1)}); + auto load0_up = 
std::make_shared(inner_loop_begin_up->output(0)); + auto load1_up = std::make_shared(inner_loop_begin_up->output(1)); + auto add = std::make_shared(load0_up, load1_up); + auto relu = std::make_shared(add); + auto store_up = std::make_shared(relu); + auto inner_loop_end_up = std::make_shared( + OutputVector{store_up, inner_loop_begin_up->output(2)}, shape[shape.size() - 1], vector_size, + inner_ptr_increments, inner_finalization_offsets); + auto outer_loop_end_up = std::make_shared( + OutputVector{inner_loop_end_up->output(0), outer_loop_begin_up->output(2)}, shape[shape.size() - 2], 1, + std::vector{0, 0, 0}, std::vector{0, 0, 0}); + + auto buffer = std::make_shared(outer_loop_end_up); + + auto data2 = std::make_shared(element::f32, shape); + + auto outer_loop_begin_down = std::make_shared(OutputVector{buffer, data2}); + auto inner_loop_begin_down = std::make_shared(OutputVector{outer_loop_begin_down->output(0), + outer_loop_begin_down->output(1)}); + auto load0_down = std::make_shared(inner_loop_begin_down->output(0)); + auto load1_down = std::make_shared(inner_loop_begin_down->output(1)); + auto mul = std::make_shared(load0_down, load1_down); + auto hswish = std::make_shared(mul); + auto store_down = std::make_shared(hswish); + auto inner_loop_end_down = std::make_shared( + OutputVector{store_down, inner_loop_begin_down->output(2)}, shape[shape.size() - 1], vector_size, + inner_ptr_increments, inner_finalization_offsets); + auto outer_loop_end_down = std::make_shared( + OutputVector{inner_loop_end_down->output(0), outer_loop_begin_down->output(2)}, shape[shape.size() - 2], 1, + std::vector{0, 0, 0}, std::vector{0, 0, 0}); + + f = std::make_shared(OutputVector{outer_loop_end_down->output(0)}, ParameterVector{data0, data1, data2}); + + pass::Manager m; + m.register_pass(); + m.register_pass(); + m.run_passes(f); + } + { + const std::vector inner_ptr_increments(4, vector_size); + const std::vector inner_finalization_offsets(4, 0); + + auto data0 = 
std::make_shared(element::f32, shape); + auto data1 = std::make_shared(element::f32, shape); + auto data2 = std::make_shared(element::f32, shape); + + auto outer_loop_begin = std::make_shared(OutputVector{data0, data1, data2}); + auto inner_loop_begin = std::make_shared(OutputVector{outer_loop_begin->output(0), + outer_loop_begin->output(1), + outer_loop_begin->output(2)}); + auto load0 = std::make_shared(inner_loop_begin->output(0)); + auto load1 = std::make_shared(inner_loop_begin->output(1)); + auto load2 = std::make_shared(inner_loop_begin->output(2)); + auto add = std::make_shared(load0, load1); + auto relu = std::make_shared(add); + auto mul = std::make_shared(relu, load2); + auto hswish = std::make_shared(mul); + auto store = std::make_shared(hswish); + auto inner_loop_end = std::make_shared( + OutputVector{store, inner_loop_begin->output(3)}, shape[shape.size() - 1], vector_size, + inner_ptr_increments, inner_finalization_offsets); + auto outer_loop_end = std::make_shared( + OutputVector{inner_loop_end->output(0), outer_loop_begin->output(3)}, shape[shape.size() - 2], 1, + std::vector{0, 0, 0, 0}, std::vector{0, 0, 0, 0}); + + f_ref = std::make_shared(OutputVector{outer_loop_end->output(0)}, ParameterVector{data0, data1, data2}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} diff --git a/src/common/snippets/tests/src/pass/mha_tokenization.cpp b/src/common/snippets/tests/src/pass/mha_tokenization.cpp new file mode 100644 index 00000000000..4c3d967be5f --- /dev/null +++ b/src/common/snippets/tests/src/pass/mha_tokenization.cpp @@ -0,0 +1,38 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include "snippets/pass/tokenization.hpp" +#include "snippets/pass/explicit_transpose_matmul_inputs.hpp" + +namespace ov { +namespace test { +namespace snippets { + +void TokenizeMHASnippetsTests::run() { + ASSERT_TRUE(function); + std::string name; + 
manager.register_pass(); + manager.register_pass(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA) { + const auto &f = MHAFunction(std::vector{{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}); + function = f.getOriginal(); + function_ref = f.getReference(); + run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_with_MatMul0_Transpose) { + const auto &f = MHAMatMul0TransposeFunction(std::vector{{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}); + function = f.getOriginal(); + function_ref = f.getReference(); + run(); +} + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/tests/src/pass/softmax_decomposition.cpp b/src/common/snippets/tests/src/pass/softmax_decomposition.cpp new file mode 100644 index 00000000000..e3330bd69de --- /dev/null +++ b/src/common/snippets/tests/src/pass/softmax_decomposition.cpp @@ -0,0 +1,122 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "pass/softmax_decomposition.hpp" +#include "common_test_utils/common_utils.hpp" +#include "subgraph_softmax.hpp" +#include "subgraph_lowered.hpp" + +#include "snippets/pass/softmax_decomposition.hpp" +#include "snippets/pass/insert_load_store.hpp" +#include "snippets/pass/insert_movebroadcast.hpp" +#include "snippets/pass/insert_buffer.hpp" +#include "snippets/pass/convert_power_to_powerstatic.hpp" + + +namespace ov { +namespace test { +namespace snippets { + +std::string SoftmaxTests::getTestCaseName(testing::TestParamInfo obj) { + Shape inputShape; + int axis; + std::tie(inputShape, axis) = obj.param; + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_"; + result << "Axis=" << axis << "_"; + return result.str(); +} + +void SoftmaxTests::SetUp() { + LoweringTests::SetUp(); + + const size_t count = 10; + manager.register_pass(count); + 
manager.register_pass(); + manager.register_pass(count); + manager.register_pass(count); + manager.register_pass(); + Shape inputShape; + int axis; + std::tie(inputShape, axis) = this->GetParam(); + snippets_function = std::make_shared(std::vector{inputShape}, axis); + master_shape = inputShape; +} + +std::string AddSoftmaxTests::getTestCaseName(testing::TestParamInfo obj) { + Shape inputShape0, inputShape1; + int axis; + std::tie(inputShape0, inputShape1, axis) = obj.param; + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(inputShape0) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(inputShape1) << "_"; + result << "Axis=" << axis << "_"; + return result.str(); +} + +void AddSoftmaxTests::SetUp() { + LoweringTests::SetUp(); + + const size_t count = 10; + manager.register_pass(); + manager.register_pass(count); + manager.register_pass(); + manager.register_pass(count); + manager.register_pass(count); + manager.register_pass(); + Shape inputShape0, inputShape1; + int axis; + std::tie(inputShape0, inputShape1, axis) = this->GetParam(); + snippets_function = std::make_shared(std::vector{inputShape0, inputShape1}, axis); + + ov::PartialShape master_pshape(inputShape0); + ov::PartialShape::broadcast_merge_into(master_pshape, inputShape1, op::AutoBroadcastType::NUMPY); + master_shape = master_pshape.get_shape(); +} + +TEST_P(SoftmaxTests, SoftmaxDecomposition) { + PartialShape scheduler_shape({master_shape[master_shape.size() - 2], + master_shape[master_shape.size() - 1]}); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape); + function = subgraph->body_ptr(); + function_ref = snippets_function->getLowered(); +} + +TEST_P(AddSoftmaxTests, AddSoftmaxDecomposition) { + PartialShape scheduler_shape({master_shape[master_shape.size() - 2], + master_shape[master_shape.size() - 1]}); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape); + function = subgraph->body_ptr(); 
+ function_ref = snippets_function->getLowered(); +} + +namespace SoftmaxTestsInstantiation { +std::vector inputShape{{12, 4, 12, 12, 127}, {12, 4, 12, 12, 1}}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_SoftmaxDecomposition, SoftmaxTests, + ::testing::Combine( + ::testing::ValuesIn(inputShape), + ::testing::Values(-1)), + SoftmaxTests::getTestCaseName); + +} // namespace SoftmaxTestsInstantiation + +namespace AddSoftmaxTestsInstantiation { +std::vector inputShape0{{12, 4, 12, 12, 17}, {12, 4, 12, 12, 1}}; +std::vector inputShape1{{12, 4, 12, 12, 17}, {12, 4, 12, 12, 1}}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_AddSoftmaxDecomposition, AddSoftmaxTests, + ::testing::Combine( + ::testing::ValuesIn(inputShape0), + ::testing::ValuesIn(inputShape1), + ::testing::Values(-1)), + AddSoftmaxTests::getTestCaseName); + +} // namespace AddSoftmaxTestsInstantiation + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/common/snippets/tests/src/pass/softmax_reshape_elimination.cpp b/src/common/snippets/tests/src/pass/softmax_reshape_elimination.cpp new file mode 100644 index 00000000000..f8b51924a02 --- /dev/null +++ b/src/common/snippets/tests/src/pass/softmax_reshape_elimination.cpp @@ -0,0 +1,70 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include + +#include +#include + +#include + +#include "common_test_utils/ngraph_test_utils.hpp" + +using namespace testing; +using namespace ngraph; + +TEST_F(TransformationTestsF, SoftmaxV1ReshapeElimination) { + { + auto data = std::make_shared(element::f32, Shape{2, 3, 240}); + auto shape0 = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{6, 240}); + auto reshape0 = std::make_shared(data, shape0, false); + auto softmax_v1 = std::make_shared(reshape0, 1); + auto shape1 = std::make_shared(ov::element::i32, ov::Shape{3}, std::vector{2, 3, 240}); + auto reshape1 = std::make_shared(softmax_v1, shape1, false); + function = 
std::make_shared(NodeVector{reshape1}, ParameterVector{data}); + + manager.register_pass(); + } + { + auto data = std::make_shared(element::f32, Shape{2, 3, 240}); + auto softmax_v1 = std::make_shared(data, 2); + function_ref = std::make_shared(NodeVector{softmax_v1}, ParameterVector{data}); + } +} + +TEST_F(TransformationTestsF, SoftmaxV8ReshapeElimination) { + { + auto data = std::make_shared(element::f32, Shape{1, 2, 340, 240}); + auto shape0 = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{680, 240}); + auto reshape0 = std::make_shared(data, shape0, false); + auto softmax_v1 = std::make_shared(reshape0, -1); + auto shape1 = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 2, 340, 240}); + auto reshape1 = std::make_shared(softmax_v1, shape1, false); + function = std::make_shared(NodeVector{reshape1}, ParameterVector{data}); + + manager.register_pass(); + } + { + auto data = std::make_shared(element::f32, Shape{1, 2, 340, 240}); + auto softmax_v1 = std::make_shared(data, 3); + function_ref = std::make_shared(NodeVector{softmax_v1}, ParameterVector{data}); + } +} + +TEST_F(TransformationTestsF, SoftmaxReshapeElimination_IncorrectReshape) { + { + auto data = std::make_shared(element::f32, Shape{1, 2, 340, 240}); + auto shape0 = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{2, 81600}); + auto reshape0 = std::make_shared(data, shape0, false); + auto softmax_v1 = std::make_shared(reshape0, -1); + auto shape1 = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 2, 340, 240}); + auto reshape1 = std::make_shared(softmax_v1, shape1, false); + function = std::make_shared(NodeVector{reshape1}, ParameterVector{data}); + + manager.register_pass(); + } +} diff --git a/src/common/snippets/tests/src/registers.cpp b/src/common/snippets/tests/src/registers.cpp index 677a318dfab..41b94db5e3f 100644 --- a/src/common/snippets/tests/src/registers.cpp +++ b/src/common/snippets/tests/src/registers.cpp @@ -33,6 +33,8 @@ 
TEST(TransformationTests, AssignRegisters) { auto s00 = std::make_shared(y02); s00->set_friendly_name("y03"); s00->set_friendly_name("s00"); f = std::make_shared(NodeVector{s00}, ParameterVector{p0, p1}); + // Note that testing the result is not strictly necessary, since the Result doesn't emit any code + f->get_result()->set_friendly_name("r00"); pass::Manager m; m.register_pass(); @@ -52,18 +54,19 @@ TEST(TransformationTests, AssignRegisters) { {"y01", 1}, {"y02", 2}, {"s00", 2}, // gpr + {"r00", 2} // gpr }; auto total_ops = 0; for (auto& op : f->get_ordered_ops()) { - auto& rt = op->get_rt_info(); - - auto it_rinfo = rt.find("reginfo"); - if (it_rinfo != rt.end()) { - auto reginfo = it_rinfo->second.as>(); - auto reg = reginfo[0]; - ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg); - total_ops++; + for (const auto& output : op->outputs()) { + const auto& rt = output.get_tensor_ptr()->get_rt_info(); + auto it_rt = rt.find("reginfo"); + if (it_rt != rt.end()) { + auto reg = it_rt->second.as(); + ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg); + total_ops++; + } } } ASSERT_EQ(total_ops, ref_registers.size()); @@ -120,6 +123,7 @@ TEST(TransformationTests, AssignRegisters2) { s00->set_friendly_name("s00"); f = std::make_shared(NodeVector{s00}, ParameterVector{p0, p1, p2, p3, p4, p5, p6, p7}); + f->get_result()->set_friendly_name("res00"); pass::Manager m; m.register_pass(); @@ -140,17 +144,19 @@ TEST(TransformationTests, AssignRegisters2) { {"r18", 0}, {"r19", 2}, {"r20", 4}, {"r21", 1}, {"r22", 0}, {"r23", 6}, {"r24", 1}, {"s00", 8}, + {"res00", 8} }; auto total_ops = 0; for (auto& op : f->get_ordered_ops()) { - auto& rt = op->get_rt_info(); - auto it_rinfo = rt.find("reginfo"); - if (it_rinfo != rt.end()) { - auto reginfo = it_rinfo->second.as>(); - auto reg = reginfo[0]; - ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg); - total_ops++; + for (const auto& output : op->outputs()) { + const auto& rt = 
output.get_tensor_ptr()->get_rt_info(); + auto it_rt = rt.find("reginfo"); + if (it_rt != rt.end()) { + auto reg = it_rt->second.as(); + ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg); + total_ops++; + } } } ASSERT_EQ(total_ops, ref_registers.size()); diff --git a/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp b/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp index 67a557298ed..45e4876998a 100644 --- a/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp +++ b/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp @@ -653,7 +653,7 @@ NGRAPH_TEST(${BACKEND_NAME}, onnx_model_attention_qkv_hidden_sizes) { test_case.add_input(input); test_case.add_expected_output(output); - test_case.run_with_tolerance_as_fp(1e-6); + test_case.run_with_tolerance_as_fp(1e-4); } NGRAPH_TEST(${BACKEND_NAME}, onnx_model_attention_unidirectional) { diff --git a/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp b/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp index e112fe687ad..294ab6a18e0 100644 --- a/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp +++ b/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp @@ -108,6 +108,19 @@ DECLARE_CONFIG_KEY(CONFIG_DEVICE_ID); * @brief enable hyper thread */ DECLARE_CONFIG_KEY(ENABLE_HYPER_THREAD); + +/** + * @brief Defines Snippets tokenization mode + * @param ENABLE - default pipeline + * @param IGNORE_CALLBACK - disable the Snippets markup transformation and tokenization callback + * @param DISABLE - turn off the Snippets + * @ingroup ie_dev_api_plugin_api + */ +DECLARE_CONFIG_KEY(SNIPPETS_MODE); +DECLARE_CONFIG_VALUE(ENABLE); +DECLARE_CONFIG_VALUE(IGNORE_CALLBACK); +DECLARE_CONFIG_VALUE(DISABLE); + } // namespace PluginConfigInternalParams } // namespace InferenceEngine diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index 92f498876e3..b7a52b2b21c 100644 --- 
a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -188,6 +188,16 @@ void Config::readProperties(const std::map &prop) { IE_THROW() << "Wrong value for property key " << CPUConfigParams::KEY_CPU_DENORMALS_OPTIMIZATION << ". Expected only YES/NO"; } + } else if (key == PluginConfigInternalParams::KEY_SNIPPETS_MODE) { + if (val == PluginConfigInternalParams::ENABLE) + snippetsMode = SnippetsMode::Enable; + else if (val == PluginConfigInternalParams::IGNORE_CALLBACK) + snippetsMode = SnippetsMode::IgnoreCallback; + else if (val == PluginConfigInternalParams::DISABLE) + snippetsMode = SnippetsMode::Disable; + else + IE_THROW() << "Wrong value for property key " << PluginConfigInternalParams::KEY_SNIPPETS_MODE + << ". Expected values: ENABLE/DISABLE/IGNORE_CALLBACK"; } else { IE_THROW(NotFound) << "Unsupported property " << key << " by CPU plugin"; } diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index 8a1dcc59b83..012fd0fd9f0 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -32,9 +32,16 @@ struct Config { DO_On, }; + enum SnippetsMode { + Enable, + IgnoreCallback, + Disable, + }; + bool collectPerfCounters = false; bool exclusiveAsyncRequests = false; bool enableDynamicBatch = false; + SnippetsMode snippetsMode = SnippetsMode::Enable; std::string dumpToDot = ""; int batchLimit = 0; float fcSparseWeiDecompressionRate = 1.0f; diff --git a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp index bba788545f2..fb3f12a9761 100644 --- a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp @@ -17,6 +17,7 @@ #include "snippets_transformations/op/load_convert.hpp" #include "snippets_transformations/op/store_convert.hpp" +#include "snippets/op/brgemm.hpp" #include "ngraph_transformations/op/swish_cpu.hpp" #include @@ -45,9 +46,12 @@ 
ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ // data movement jitters[ngraph::opset1::Parameter::get_type_info_static()] = CREATE_EMITTER(NopEmitter); jitters[ngraph::opset1::Result::get_type_info_static()] = CREATE_EMITTER(NopEmitter); + jitters[ngraph::snippets::op::Buffer::get_type_info_static()] = CREATE_EMITTER(NopEmitter); + jitters[ngraph::snippets::op::VectorBuffer::get_type_info_static()] = CREATE_EMITTER(VectorBufferEmitter); // jitters[ngraph::opset1::Constant::get_type_info_static()] = CREATE_EMITTER(); // Not supported jitters[ngraph::snippets::op::Load::get_type_info_static()] = CREATE_EMITTER(LoadEmitter); + jitters[ngraph::snippets::op::LoadReshape::get_type_info_static()] = CREATE_EMITTER(LoadEmitter); jitters[ngraph::snippets::op::BroadcastLoad::get_type_info_static()] = CREATE_EMITTER(BroadcastLoadEmitter); jitters[ov::intel_cpu::LoadConvertSaturation::get_type_info_static()] = CREATE_EMITTER(LoadConvertEmitter); jitters[ov::intel_cpu::LoadConvertTruncation::get_type_info_static()] = CREATE_EMITTER(LoadConvertEmitter); @@ -65,6 +69,9 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ jitters[ngraph::snippets::op::ConvertSaturation::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_convert_saturation_emitter); // jitters[ngraph::opset1::FakeQuantize::get_type_info_static()] = CREATE_EMITTER(); // not supported + // ternary + jitters[ngraph::opset1::Select::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_select_emitter); + // binary jitters[ngraph::opset1::Add::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_add_emitter); jitters[ngraph::opset1::Divide::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_divide_emitter); @@ -121,10 +128,15 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ // jitters[ngraph::opset1::Selu::get_type_info_static()] = CREATE_EMITTER(); // not supported 
jitters[ngraph::op::v0::Gelu::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_gelu_v0_emitter); jitters[ngraph::op::v7::Gelu::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_gelu_v7_emitter); + jitters[ngraph::snippets::op::Fill::get_type_info_static()] = CREATE_EMITTER(FillEmitter); + + jitters[ngraph::snippets::op::HorizonMax::get_type_info_static()] = CREATE_EMITTER(HorizonMaxEmitter); + jitters[ngraph::snippets::op::HorizonSum::get_type_info_static()] = CREATE_EMITTER(HorizonSumEmitter); jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = CREATE_EMITTER(KernelEmitter); - jitters[ngraph::snippets::op::Tile::get_type_info_static()] = CREATE_EMITTER(TileEmitter); - jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = CREATE_EMITTER(TileSchedulerEmitter); + jitters[ngraph::snippets::op::LoopBegin::get_type_info_static()] = CREATE_EMITTER(LoopBeginEmitter); + jitters[ngraph::snippets::op::LoopEnd::get_type_info_static()] = CREATE_EMITTER(LoopEndEmitter); + jitters[ngraph::snippets::op::Brgemm::get_type_info_static()] = CREATE_EMITTER(BrgemmEmitter); } size_t ov::intel_cpu::CPUTargetMachine::get_lanes() const { @@ -141,7 +153,9 @@ bool ov::intel_cpu::CPUTargetMachine::is_supported() const { } code ov::intel_cpu::CPUTargetMachine::get_snippet() const { - h->create_kernel(); + if (h->create_kernel() != status::success) { + IE_THROW() << "Failed to create jit_kernel in get_snippet()"; + } return h->jit_ker(); } diff --git a/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.cpp index 506b77603e4..143a78ef173 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.cpp @@ -2143,5 +2143,66 @@ void jit_is_nan_emitter::register_table_entries() { } } +/// SELECT /// +jit_select_emitter::jit_select_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& 
node, Precision exec_prc) + : jit_emitter(host, host_isa, node, exec_prc) {} +jit_select_emitter::jit_select_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} + +size_t jit_select_emitter::get_inputs_num() const { return 3; } + +size_t jit_select_emitter::aux_vecs_count() const { + if (host_isa_ == x64::avx512_core) + return 0; + else if (host_isa_ == x64::avx2) // tmp vec for mask + return 1; + else // mask should be xmm0 on sse41 + tmp vec for mask + return 2; +} + +void jit_select_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, + const emitter_context *emit_context) const { + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_select_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_cond = Vmm(in_vec_idxs[0]); + Vmm vmm_src0 = Vmm(in_vec_idxs[1]); + Vmm vmm_src1 = Vmm(in_vec_idxs[2]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + if (isa == x64::sse41) { + Vmm vmm_mask = Vmm(aux_vec_idxs[0]); + Vmm vmm_zero = Vmm(aux_vec_idxs[1]); + h->uni_vpxor(vmm_zero, vmm_zero, vmm_zero); + h->uni_vcmpps(vmm_cond, vmm_cond, vmm_zero, 0x4); + if (vmm_mask.getIdx() != vmm_cond.getIdx()) { + h->uni_vmovups(vmm_mask, vmm_cond); + } + if (vmm_src1.getIdx() != vmm_dst.getIdx()) { + h->uni_vmovups(vmm_dst, vmm_src1); + } + h->uni_vblendvps(vmm_dst, vmm_dst, vmm_src0, vmm_mask); + } else if (isa == x64::avx2) { + Vmm vmm_zero = Vmm(aux_vec_idxs[0]); + h->uni_vpxor(vmm_zero, vmm_zero, vmm_zero); + h->uni_vcmpps(vmm_cond, vmm_cond, vmm_zero, 0x4); + 
h->uni_vblendvps(vmm_dst, vmm_src1, vmm_src0, vmm_cond); + } else { + h->vptestmd(k_mask, vmm_cond, vmm_cond); + h->vblendmps(vmm_dst | k_mask, vmm_src1, vmm_src0); + } +} } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.hpp index b8059793859..83a042633df 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.hpp @@ -692,5 +692,23 @@ private: void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; }; +class jit_select_emitter : public jit_emitter { +public: + jit_select_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_select_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + + size_t get_inputs_num() const override; + size_t aux_vecs_count() const override; + +private: + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, + const emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; +}; } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp index 2130457847f..728c5de139b 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp @@ -7,8 +7,12 @@ #include #include "jit_snippets_emitters.hpp" +#include "snippets/op/brgemm.hpp" +#include "snippets/op/subgraph.hpp" +#include 
"snippets/utils.hpp" using namespace Xbyak; +using ngraph::snippets::op::Subgraph; namespace ov { namespace intel_cpu { @@ -23,57 +27,71 @@ jit_container_emitter::jit_container_emitter(dnnl::impl::cpu::x64::jit_generator in_out_type_ = emitter_in_out_map::gpr_to_gpr; } -void jit_container_emitter::map_abstract_registers(const std::vector &vec_pool, const std::vector &gpr_pool, - std::set& vecs_used, std::set& gprs_used) { - if (body.empty()) - IE_THROW() << "Cannot map registers for jit_container_emitter when its body is empty"; - auto abstract_to_physical = [](const std::vector& abstract_regs, const std::vector& regs_pool) { +void jit_container_emitter::map_abstract_registers(mapping_info& gpr_map_pool, mapping_info& vec_map_pool, + std::vector& allocated_emitters) const { + if (allocated_emitters.empty()) + IE_THROW() << "Cannot map registers when there is no allocated_emitters provided"; + auto map_regs = [](const std::vector& abstract_regs, mapping_info& mapping) { + auto& abstract_to_physical = mapping.first; + auto& regs_pool = mapping.second; std::vector physical_regs(abstract_regs.size()); - for (size_t i = 0; i < abstract_regs.size(); i++) - physical_regs[i] = regs_pool.at(abstract_regs[i]); + for (size_t i = 0; i < abstract_regs.size(); i++) { + const auto abstract = abstract_regs[i]; + auto& physical = physical_regs[i]; + if (abstract_to_physical.count(abstract) == 0) { + if (regs_pool.empty()) + IE_THROW() << "Cannot map registers for jit_container_emitter: not enough regs in the pool"; + physical = regs_pool.back(); + regs_pool.pop_back(); + abstract_to_physical[abstract] = physical; + } else { + physical = abstract_to_physical[abstract]; + } + } return physical_regs; }; - for (auto& code : body) { + + for (auto& code : allocated_emitters) { const auto& emitter = code.first; std::vector in_abstract_regs, out_abstract_regs; std::tie(in_abstract_regs, out_abstract_regs) = code.second; std::vector in_physical_regs, out_physical_regs; switch 
(std::dynamic_pointer_cast(emitter)->get_in_out_type()) { case gpr_to_gpr: - // Note that gpr_to_gpr is used for high-level utility operations like Kernel/TileScheduler/Tile. + // Note that gpr_to_gpr is used for high-level utility operations like Kernel/Loop. // Input registers are not mapped in this case, since they contain utility info - // (num_params, tile increment, etc.), but not reg indexes. - in_physical_regs = std::move(in_abstract_regs); - out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, gpr_pool)); - gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + // (num_params, loop increment, etc.), but not reg indexes. + // todo: Note that LoopBeginEmitter and LoopEndEmitter demonstrate new paradigm, + // where all utility emitters align with conventional Op emitters + if (std::dynamic_pointer_cast(emitter) || + std::dynamic_pointer_cast(emitter) || + std::dynamic_pointer_cast(emitter)) + in_physical_regs = std::move(map_regs(in_abstract_regs, gpr_map_pool)); + else + in_physical_regs = std::move(in_abstract_regs); + out_physical_regs = std::move(map_regs(out_abstract_regs, gpr_map_pool)); break; case gpr_to_vec: // Load Emitters - in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, gpr_pool)); - out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, vec_pool)); - gprs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); - vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + in_physical_regs = std::move(map_regs(in_abstract_regs, gpr_map_pool)); + out_physical_regs = std::move(map_regs(out_abstract_regs, vec_map_pool)); break; case vec_to_gpr: // Store Emitters - in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, vec_pool)); - out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, gpr_pool)); - vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); - gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); 
+ in_physical_regs = std::move(map_regs(in_abstract_regs, vec_map_pool)); + out_physical_regs = std::move(map_regs(out_abstract_regs, gpr_map_pool)); break; case vec_to_vec: // Regular operations - in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, vec_pool)); - out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, vec_pool)); - vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); - vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + in_physical_regs = std::move(map_regs(in_abstract_regs, vec_map_pool)); + out_physical_regs = std::move(map_regs(out_abstract_regs, vec_map_pool)); break; default: IE_THROW() << "Unhandled in_out type"; } code.second = std::make_pair(in_physical_regs, out_physical_regs); if (auto container = std::dynamic_pointer_cast(code.first)) - container->map_abstract_registers(vec_pool, gpr_pool, vecs_used, gprs_used); + container->map_abstract_registers(gpr_map_pool, vec_map_pool, allocated_emitters); } } @@ -84,15 +102,66 @@ KernelEmitter::KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl: IE_THROW() << "KernelEmitter invoked with invalid op argument"; if (kernel->region.empty()) IE_THROW() << "KernelEmitter invoked with empty body"; + if (kernel->compile_params == nullptr) + IE_THROW() << "KernelEmitter invoked with op::Kernel that contains no compile_params"; body = kernel->region; - if (!kernel->compile_params) - IE_THROW() << "KernelEmitter invoked without compile_params"; jcp = *reinterpret_cast(kernel->compile_params); + // calc data access pattern. 
we'll need it for offsets calculation + const auto& model = kernel->model; + const auto get_static_shape = [](const std::shared_ptr& node) { + const auto& pshape = node->get_output_partial_shape(0); + if (pshape.is_dynamic()) + IE_THROW() << "KernelEmitter can't calc offsets for dynamic shapes"; + return pshape.get_shape(); + }; + const auto get_data_layout = [](const Output& out, std::vector& shape) { + const auto& layout = ngraph::snippets::utils::get_node_output_layout(out.get_node_shared_ptr()); + // default access pattern + if (!layout.empty()) { + const auto layout_shape_diff = static_cast(shape.size()) - static_cast(layout.size()); + // Plugin can (and usually does) prepend shapes with 1's to facilitate scheduling, here we can safely remove leading 1's + if (layout_shape_diff > 0) { + if (std::any_of(shape.begin(), shape.begin() + layout_shape_diff, [](size_t x){return x != 1;})) + IE_THROW() << "KernelEmitter detected shape vs access pattern conflict: only leading 1's can be removed from the shape"; + shape.erase(shape.begin(), shape.begin() + layout_shape_diff); + } + } + return layout; + }; + const auto& ops = model->get_ordered_ops(); + auto params = model->get_parameters(); + auto results = model->get_results(); + num_inputs = params.size(); + num_outputs = results.size(); + is_buffer_needed = std::any_of(ops.begin(), ops.end(), + [](const std::shared_ptr& node) { return ov::is_type(node); } ); + NodeVector io_nodes; + std::copy(params.begin(), params.end(), std::back_inserter(io_nodes)); + std::copy(results.begin(), results.end(), std::back_inserter(io_nodes)); + + const auto& model_rt_info = model->get_rt_info(); + const auto& plugin_shapes = model_rt_info.find("PluginShapesOverride"); + if (plugin_shapes == model_rt_info.end()) { + IE_THROW() << "JIT KernelEmitter requires plugin-overriden shapes in model rt_info"; + } else { + const auto& new_shapes = plugin_shapes->second.as>>(); + if (new_shapes.size() != num_inputs + num_outputs) + IE_THROW() << 
"JIT KernelEmitter detected invalid plugin-overriden shapes"; + io_shapes = new_shapes; + } + for (int i = 0; i < io_nodes.size(); i++) { + const auto& out = i < num_inputs ? io_nodes[i]->output(0) : io_nodes[i]->input_value(0); + data_layout.push_back(get_data_layout(out, io_shapes[i])); + io_data_size.push_back(out.get_element_type().size()); + } // Initialize pools of gp and vec registers gp_regs_pool.resize(16); vec_regs_pool.resize(16); - std::iota(gp_regs_pool.begin(), gp_regs_pool.end(), 0); - std::iota(vec_regs_pool.begin(), vec_regs_pool.end(), 0); + // It's easier to remove the last item during mapping, so fill descending to map ascending + for (size_t i = 0; i < 16; i++) + gp_regs_pool[i] = vec_regs_pool[i] = 15 - i; + // todo: it's more convenient to use std::set as a pool container (unique and always sorted), + // but pools are vectors to align with emit_code signature. Change signature? auto remove_regs_from_pool = [](std::vector& pool, const std::set& to_remove) { // It's important to keep the order of other elements pool.erase(std::remove_if(pool.begin(), pool.end(), @@ -101,14 +170,31 @@ KernelEmitter::KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl: // Reserve stack base and pointer for push(...) and pop(...) 
operations // Reserve abi_param1 and abi_param2, since they'll be used to pass runtime call args to kernel remove_regs_from_pool(gp_regs_pool, {Xbyak::Operand::RSP, Xbyak::Operand::RBP, - static_cast(abi_param1.getIdx()), - static_cast(abi_param2.getIdx())}); - std::set vecs_used, gprs_used; - map_abstract_registers(vec_regs_pool, gp_regs_pool, vecs_used, gprs_used); - remove_regs_from_pool(gp_regs_pool, gprs_used); - remove_regs_from_pool(vec_regs_pool, vecs_used); - // Remember used gprs to pass it to the TileSchedulerEmitter, so it can init them with appropriate data ptrs - gp_regs_used = std::vector(gprs_used.begin(), gprs_used.end()); + reg_indexes_idx, reg_const_params_idx}); + + mapping_info gpr_map_pool({}, gp_regs_pool); + mapping_info vec_map_pool({}, vec_regs_pool); + std::vector data_io_emitters; + std::copy_if(body.begin(), body.end(), std::back_inserter(data_io_emitters), + [](const AllocatedEmitter& code){ + const auto& emitter = code.first; + const auto emitter_type = std::dynamic_pointer_cast(emitter)->get_in_out_type(); + // todo: how this will be handled if Brgemm in & out are op::Buffer + // Brgemm is a special case since it incorporates input and output (we use onednn kernel) + // Just like Load & Store it requires offsets calculation + const auto is_brgemm = std::dynamic_pointer_cast(emitter) != nullptr; + return emitter_type == gpr_to_vec || emitter_type == vec_to_gpr || is_brgemm; + }); + // Note that we can't use reg_indexes_idx or reg_const_params_idx to store data pointers because these two + // regs are used to calculate offsets for the data pointers + map_abstract_registers(gpr_map_pool, vec_map_pool, data_io_emitters); + for (const auto& abstract_to_physical : gpr_map_pool.first) + data_ptr_regs_idx.push_back(abstract_to_physical.second); + // However we can use reg_indexes_idx and reg_const_params_idx for other operations since we won't need them + // after offsets calculation + gpr_map_pool.second.push_back(reg_indexes_idx); + 
gpr_map_pool.second.push_back(reg_const_params_idx); + map_abstract_registers(gpr_map_pool, vec_map_pool, body); } void KernelEmitter::emit_code(const std::vector &in, @@ -123,268 +209,259 @@ void KernelEmitter::validate_arguments(const std::vector &in, const std::vector &out, const std::vector &pool, const std::vector &gpr) const { - if (in.size() != 2) - IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 2, got " << in.size(); + if (!in.empty()) + IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 0, got " << in.size(); if (!out.empty()) - IE_THROW() << "KKernelEmitter got invalid number of outputs. Expected 0, got " << out.size(); + IE_THROW() << "KernelEmitter got invalid number of outputs. Expected 0, got " << out.size(); + const auto num_params = num_inputs + num_outputs + static_cast(is_buffer_needed); + // The number of used gpr may be >= num_params since LoopBegin+LoopEnd could also use gpr to store work_amount + if (data_ptr_regs_idx.size() != num_params) + IE_THROW() << "KernelEmitter: number of inputs and outputs is inconsisnent with the number of allocated registers" + << num_params << " data_ptr_regs_idx.size() = " << data_ptr_regs_idx.size(); } -void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, - const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector& data_ptr_regs) const { - const int64_t harness_num_dims = jcp.output_dims.size() - 1; - auto init_ptrs_with_offsets = [&](Reg64 pointer, const int64_t *offsets, Reg64 reg_tmp) { - for (int j = 0; j < harness_num_dims; j++) { - if (jcp.output_dims[j] != 1 && offsets[j] != 0) { +void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, bool is_buffer_needed, + const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector& data_ptr_regs) const { + // Note that we don't need offset for the last dim, since it's handled directly by Tile emitter + const size_t offset_rank = jcp.master_shape.size() - 
1; + //const size_t tile_rank = jcp.tile_rank; + std::vector> data_offsets(num_params, std::vector{}); + auto offset_calculation = [=](const std::vector& shape, + const std::vector& layout, const size_t data_size) { + // Strides represent distance between consecutive elements of corresponding dimension. + // If a dim size == 1, then the next dim starts immediately and the stride is 0 + // case 1: + // shape: s0, s1, s2, s3 + // strides: s1*s2*s3, s2*s3, s3, 1 + // case 2: + // shape: s0, s1, s2 == 1, s3 + // strides: s1*s3, s3, 0, 1 + std::vector strides(shape.size()); + size_t dim_step = 1; + strides[shape.size() - 1] = 1; + for (int k = static_cast(shape.size()) - 2; k >= 0; k--) { + dim_step *= shape[k+1]; + strides[k] = shape[k] != 1 ? dim_step * data_size : 0; + } + // Note: this is an extra copy, but let's keep it for clarity + if (!layout.empty()) { + std::vector reordered_strides(strides.size()); + for (auto i = 0; i < layout.size(); i++) + reordered_strides[i] = strides[layout[i]]; + strides = std::move(reordered_strides); + } + // the last stride is ignored, since the entire last dim is processed by kernel + // and no parallel_for data_ptr offsets can be applied in this case (cover tile_rank == 1) + strides.pop_back(); + // if tile_rank > 1, then zero corresponding strides since no external offset can be applied + // for (auto j = 0; j < tile_rank - 1; j++) + // strides[strides.size() - 1 - j] = 0; + // actual offset size might be larger that the shape size due to 6D scheduling + strides.insert(strides.begin(), offset_rank - strides.size(), 0); + + return strides; + }; + for (size_t i = 0; i < num_params; i++) { + data_offsets[i] = offset_calculation(io_shapes[i], data_layout[i], io_data_size[i]); + } + // master_shape size must be valid in both static and dynamic cases + std::function&, Reg64)> init_ptr_with_offset; + init_ptr_with_offset = [&](Reg64 pointer, const std::vector& offsets, Reg64 reg_tmp) { + for (int j = 0; j < offset_rank; j++) { + if 
(jcp.master_shape[j] != 1 && offsets[j] != 0) { h->mov(reg_tmp, offsets[j]); h->imul(reg_tmp, h->ptr[reg_indexes + j * sizeof(size_t)]); h->add(pointer, reg_tmp); } } }; - for (auto i = 0; i < num_params; i++) { + const auto spare_corruptable_gpr = std::find_if(gp_regs_pool.begin(), gp_regs_pool.end(), + [this](size_t reg) { + return reg != reg_indexes_idx && reg != reg_const_params_idx; + }); + const bool last_iter_explicitly = spare_corruptable_gpr == gp_regs_pool.end(); + Reg64 reg_tmp = last_iter_explicitly ? data_ptr_regs[num_params - 1] : Reg64(static_cast(*spare_corruptable_gpr)); + // Vector "data_ptr_regs" is sorted by abstract regs. + // It means that the vector contains the physical registers in order [src, .., src, dst, .., dst, buffer] + // So we can initialize buffer register firstly as last value of vector "data_ptr_regs" + if (is_buffer_needed) { + h->mov(data_ptr_regs[num_params], h->ptr[reg_const_params + GET_OFF(buffer_scratchpad_ptr)]); + } + size_t i = 0; + for (; i < num_params - last_iter_explicitly; i++) { if (i < num_inputs) h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(src_ptrs) + i * sizeof(void*)]); else h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]); - // we can use the last data_ptr_reg as tmp_reg until the last iteration, and reg_const_params then - Reg64 reg_tmp = i < num_params-1 ? 
data_ptr_regs.back() : reg_const_params; - init_ptrs_with_offsets(data_ptr_regs[i], &jcp.data_offsets[i * harness_num_dims], reg_tmp); + init_ptr_with_offset(data_ptr_regs[i], data_offsets[i], reg_tmp); + } + // a rare case when num_params is maximal, so we have no spare gprs + // * Static case: we can use reg_const_params as the last reg_tmp for the last iteration (and corrupt it), since + // it won't be used anymore + // * Dynamic case: we will need reg_const_params to pass runtime args to LoopScheduler, so we have to + // push a reg on the stack, and restore it value afterwards + if (last_iter_explicitly) { + h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]); + reg_tmp = reg_const_params; + // can corrupt reg_const_params, since we won't use it anymore + init_ptr_with_offset(data_ptr_regs[i], data_offsets[i], reg_tmp); } } void KernelEmitter::emit_impl(const std::vector& in, const std::vector& out, - const std::vector& allocated_vec_regs, - const std::vector& allocated_gp_regs, + const std::vector& vec_pool, + const std::vector& gpr_pool, const ov::intel_cpu::emitter_context *emit_context) const { h->preamble(); - const size_t num_inputs = in[0]; - const size_t num_outputs = in[1]; - - Reg64 reg_indexes = Reg64(abi_param1.getIdx()); - Reg64 reg_const_params = Reg64(abi_param2.getIdx()); + Reg64 reg_indexes = Reg64(static_cast(reg_indexes_idx)); + Reg64 reg_const_params = Reg64(static_cast(reg_const_params_idx)); std::vector data_ptr_regs; - transform_idxs_to_regs(gp_regs_used, data_ptr_regs); + transform_idxs_to_regs(data_ptr_regs_idx, data_ptr_regs); - init_data_pointers(num_inputs, num_inputs + num_outputs, reg_indexes, reg_const_params, data_ptr_regs); - // todo: emit_impl is a const method, so we can't just push_back unused regs to the gp_regs_pool. 
- // we need a more elegant approach to avoid a full copy here - auto local_gpr_pool = gp_regs_pool; - local_gpr_pool.push_back(static_cast(reg_indexes.getIdx())); - local_gpr_pool.push_back(static_cast(reg_const_params.getIdx())); + init_data_pointers(num_inputs, num_inputs + num_outputs, is_buffer_needed, reg_indexes, reg_const_params, data_ptr_regs); for (const auto& c : body) { const auto& emitter = c.first; std::vector in_regs, out_regs; std::tie(in_regs, out_regs) = c.second; - if (auto tile_scheduler = std::dynamic_pointer_cast(emitter)) - out_regs = gp_regs_used; - emitter->emit_code(in_regs, out_regs, vec_regs_pool, local_gpr_pool); + emitter->emit_code(in_regs, out_regs, vec_regs_pool, gp_regs_pool); } h->postamble(); } -TileSchedulerEmitter::TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n) : jit_container_emitter(h, isa, n) { - const auto tile_scheduler = ov::as_type_ptr(n); - if (!tile_scheduler) - IE_THROW() << "TileSchedulerEmitter invoked with invalid op argument"; - if (!tile_scheduler->compile_params) - IE_THROW() << "TileEmitter invoked without compile_params"; - body = {tile_scheduler->vector_region, tile_scheduler->scalar_region}; - jcp = *reinterpret_cast(tile_scheduler->compile_params); + +LoopBeginEmitter::LoopBeginEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_emitter(h, isa, n) { + loop_begin = ov::as_type_ptr(n); + if (!loop_begin) + IE_THROW() << "LoopBeginEmitter invoked with invalid op argument"; + const auto& target_inputs = loop_begin->output(loop_begin->get_output_size() - 1).get_target_inputs(); + // todo: this check could be excessive, since we check for it in validate_and_infer_types() + if (target_inputs.size() != 1) + IE_THROW() << "LoopBeginEmitter invoked with invalid configuration: the last output must have exactly one input attached"; + const auto loop_end = 
ov::as_type_ptr(target_inputs.begin()->get_node()->shared_from_this()); + if (!loop_end) + IE_THROW() << "LoopBeginEmitter invoked with invalid configuration: the last output must be LoopEnd"; + work_amount = loop_begin->get_work_amount(); + evaluate_once = loop_begin->get_evaluate_once(); + num_inputs = loop_begin->get_input_size(); + in_out_type_ = emitter_in_out_map::gpr_to_gpr; } -void TileSchedulerEmitter::emit_code(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { + +void LoopBeginEmitter::emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { validate_arguments(in, out, pool, gpr); emit_impl(in, out, pool, gpr, nullptr); } -void TileSchedulerEmitter::validate_arguments(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { - if (in.size() != 3) - IE_THROW() << "TileSchedulerEmitter got invalid number of inputs. Expected 3, got " << in.size(); - if (out.size() != in[0] + in[1]) - IE_THROW() << "TileSchedulerEmitter got invalid number of outputs. 
Expected " << in[0] + in[1] << " , got " << out.size(); - if (body.size() != 2) - IE_THROW() << "TileSchedulerEmitter got invalid body size, expected 2 (vector & scalar TileEmitter), got " << body.size(); - if (!(std::dynamic_pointer_cast(body[0].first) && std::dynamic_pointer_cast(body[1].first))) - IE_THROW() << "TileSchedulerEmitter can contain only TileEmitters inside its body"; + +void LoopBeginEmitter::validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + if (in.size() != num_inputs) + IE_THROW() << "Invalid inputs size: expected " << num_inputs << " got " << in.size(); + if (out.size() != num_inputs + 1) + IE_THROW() << "Invalid outputs size: expected " << num_inputs + 1 << " got " << out.size(); } -void TileSchedulerEmitter::emit_tiles(const Reg64& reg_inner_amount, const std::vector& data_ptr_regs, size_t vector_size, - const std::vector& vec_pool, const std::vector& gpr_pool) const { - // TileAllocatedEmitter is just an alias to perform dynamic_pointer_cast only once and reuse it below several times - using TileAllocatedEmitter = std::pair, const ngraph::snippets::RegInfo&>; - TileAllocatedEmitter vector_tile {std::dynamic_pointer_cast(body[0].first), body[0].second}; - TileAllocatedEmitter scalar_tile {std::dynamic_pointer_cast(body[1].first), body[1].second}; - const size_t inner_work_amount = jcp.scheduler_dims[1]; - auto process_tile = - [&](const bool evaluate_once, const TileAllocatedEmitter& tile) { - // If Tile is evaluated only once, then we can emit its body directly and skip work_amount decrements and checks - if (evaluate_once) { - tile.first->emit_body(vec_pool, gpr_pool); - } else { - std::vector in_regs, out_regs; - std::tie(in_regs, out_regs) = tile.second; - // pass work_amount reg to Tile - in_regs.push_back(static_cast(reg_inner_amount.getIdx())); - for (const auto& reg : data_ptr_regs) - out_regs.emplace_back(reg.getIdx()); - tile.first->emit_code(in_regs, 
out_regs, vec_pool, gpr_pool); - } - }; - // todo: these optimizations should be performed on using Tile graph representation in the future - bool vector_evaluate_once = false; - if (inner_work_amount >= vector_size) { - vector_evaluate_once = inner_work_amount < 2 * vector_size; - // Need to set proper work amount for inner tiles if evaluated multiple times - if (!vector_evaluate_once) - h->mov(reg_inner_amount, inner_work_amount); - process_tile(vector_evaluate_once, vector_tile); - } - if (inner_work_amount % vector_size >= 1) { - bool scalar_evaluate_once = inner_work_amount % vector_size < 2; - if (!scalar_evaluate_once) { - // vector_tile is not executed, work_amount is not set - if (inner_work_amount < vector_size) { - h->mov(reg_inner_amount, inner_work_amount); - // vector_tile is executed, but work_amount is neither set nor decremented appropriately. - } else if (vector_evaluate_once) { - vector_tile.first->emit_ptr_increments(data_ptr_regs); - h->mov(reg_inner_amount, inner_work_amount - vector_size); - } - // else: vector_tile is executed multiple times, so work_amount is already set - } else { - if (vector_evaluate_once) { - vector_tile.first->emit_ptr_increments(data_ptr_regs); - } - } - process_tile(scalar_evaluate_once, scalar_tile); +void LoopBeginEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + // todo: In dynamic case we will also need to set broadcasting info here + Reg64 reg_work_amount = Reg64(out.back()); + Label for_body; + // save previous register state (if there is an outer loop that uses this reg for example) + if (!evaluate_once) { + h->mov(reg_work_amount, work_amount); } + // Note: loop address is not calculated at this point, so need to call calcJmpAddress() which is protected + // or ready(), but they both set internal flags and that's not a desired way to use them. 
+ // So the most obvious WA is just to use current address manually + loop_begin->begin_address = h->getCurr(); + loop_begin->input_regs = in; } -void TileSchedulerEmitter::emit_impl(const std::vector& in, - const std::vector& out, - const std::vector& vec_pool, - const std::vector& gpr_pool, - const ov::intel_cpu::emitter_context *emit_context) const { - const size_t num_inputs = in[0]; - const size_t num_outputs = in[1]; - const size_t vector_size = in[2]; - const size_t num_params = num_inputs + num_outputs; - const auto& data_ptr_reg_idxs(out); +LoopEndEmitter::LoopEndEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_emitter(h, isa, n) { + loop_end = ov::as_type_ptr(n); + if (!loop_end) + IE_THROW() << "LoopEndEmitter invoked with invalid op argument"; + loop_begin = loop_end->get_loop_begin(); + // todo: this check could be excessive, since we check for it in validate_and_infer_types() + if (!loop_begin) + IE_THROW() << "LoopEndEmitter invoked with invalid configuration: the last arg must be LoopBegin"; + // Note that 1 edge connects LoopBegin and LoopEnd + num_inputs = loop_begin->get_input_size(); + num_outputs = loop_end->get_output_size(); + wa_increment = loop_end->get_increment(); + work_amount = loop_end->get_work_amount(); + ptr_increments = loop_end->get_ptr_increments(); + finalization_offsets = loop_end->get_finalization_offsets(); + evaluate_once = loop_end->get_evaluate_once(); + for (int i = 0; i < num_inputs; i++) + io_data_size.push_back(static_cast(loop_begin->get_input_element_type(i).size())); + for (int i = 0; i < num_outputs; i++) + io_data_size.push_back(static_cast(loop_end->get_output_element_type(i).size())); + in_out_type_ = emitter_in_out_map::gpr_to_gpr; +} + +void LoopEndEmitter::emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + validate_arguments(in, out, pool, gpr); + emit_impl(in, out, 
pool, gpr, nullptr); +} + + +void LoopEndEmitter::validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + if (loop_begin->input_regs.size() != num_inputs) + IE_THROW() << "Invalid loop_begin->input_regs size: expected " << num_inputs << " got " << loop_begin->input_regs.size(); + if (out.size() != num_outputs) + IE_THROW() << "Invalid number of out arguments: expected " << num_outputs << " got " << out.size(); + if (in.size() != num_outputs + 1) + IE_THROW() << "Invalid number of in arguments: expected " << num_inputs + 1 << " got " << in.size(); + const auto io_size = num_inputs + num_outputs; + if (ptr_increments.size() != io_size) + IE_THROW() << "Invalid apply_increments size: expected " << io_size << " got " << ptr_increments.size(); + if (finalization_offsets.size() != io_size) + IE_THROW() << "Invalid finalization_offsets size: expected: " << io_size << " got " << finalization_offsets.size(); +} + +void LoopEndEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + std::vector data_ptr_reg_idxs(loop_begin->input_regs); + data_ptr_reg_idxs.reserve(num_inputs + num_outputs); + std::copy(out.begin(), out.end(), std::back_inserter(data_ptr_reg_idxs)); std::vector data_ptr_regs; transform_idxs_to_regs(data_ptr_reg_idxs, data_ptr_regs); - // todo: emit_impl has const input args, so we can't just pop_back necessary regs from gpr_pool. - // we need a more elegant approach to avoid a full copy here. 
Similar problem is demonstrated in KernelEmitter - auto local_gpr_pool = gpr_pool; - Reg64 reg_outer_amount = Reg64(static_cast(local_gpr_pool.back())); - local_gpr_pool.pop_back(); - Reg64 reg_inner_amount = Reg64(static_cast(local_gpr_pool.back())); - local_gpr_pool.pop_back(); - Label for_body; - const size_t outer_work_amount = jcp.scheduler_dims[0]; - if (outer_work_amount == 1) { - // emit code directly without looping over external dim - emit_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool); - } else if (outer_work_amount > 1) { - // We need to create a Loop in this case - h->mov(reg_outer_amount, outer_work_amount); - h->L(for_body); - { - emit_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool); - - // Todo: Load and Store emitters are currently implemented so they ALWAYS increment appropriate pointers - // after reading/writing. This might be a problem if we need to read the same data multiple times (broadcasting shapes). - // To overcome this limitation, we add appropriate negative offsets if necessary. 
- for (auto i = 0; i < num_params; i++) { - if (jcp.scheduler_offsets[i] != 0) { - h->add(data_ptr_regs[i], jcp.scheduler_offsets[i]); - } - } - // Note that outer dimensions are always incremented by 1 (outer tiles are always scalar) - h->sub(reg_outer_amount, 1); - h->cmp(reg_outer_amount, 1); - h->jge(for_body, CodeGenerator::T_NEAR); + Reg64 reg_work_amount = Reg64(in.back()); + if (!evaluate_once) { + for (int idx = 0; idx < data_ptr_regs.size(); idx++) { + if (ptr_increments[idx] != 0) + h->add(data_ptr_regs[idx], ptr_increments[idx] * io_data_size[idx]); } + h->sub(reg_work_amount, wa_increment); + h->cmp(reg_work_amount, wa_increment); + h->jge(loop_begin->begin_address); } -} -std::vector& TileEmitter::get_nested_code() { - return body; -} - -TileEmitter::TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n) : jit_container_emitter(h, isa, n) { - const auto tile = ov::as_type_ptr(n); - if (!tile) - IE_THROW() << "TileEmitter invoked with invalid op argument"; - body = tile->region; - if (body.empty()) - IE_THROW() << "TileEmitter is invoked with empty body"; - num_inputs = tile->num_inputs; - num_outputs = tile->num_outputs; - io_dims = tile->io_dims; - io_data_size = tile->io_data_size; - increment = tile->increment; - if (io_dims.size() != num_inputs + num_outputs) - IE_THROW() << "TileEmitter constructor got inconsistent arguments. Check num_inputs + num_outputs == io_dims.size()"; -} - -void TileEmitter::emit_code(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { - validate_arguments(in, out, pool, gpr); - emit_impl(in, out, pool, gpr, nullptr); -} - -void TileEmitter::validate_arguments(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { - if (in.size() != 1) - IE_THROW() << "TileEmitter got invalid number of inputs. 
Expected 1, got " << in.size(); - if (out.size() != io_dims.size()) - IE_THROW() << "TileEmitter got invalid number of outputs. Expected " << io_dims.size() << " , got " << out.size(); -} - -void TileEmitter::emit_body(const std::vector& vec_pool, const std::vector& gpr_pool) const { - for (auto& code : body) - code.first->emit_code(code.second.first, code.second.second, vec_pool, gpr_pool); -} - -void TileEmitter::emit_ptr_increments(const std::vector& data_ptr_regs) const { - for (size_t i = 0; i < num_inputs + num_outputs; i++) { - // those with dims == 1 will be broadcasted, hence don't require increment - if (io_dims[i] != 1) - h->add(data_ptr_regs[i], increment * io_data_size[i]); + for (int idx = 0; idx < data_ptr_regs.size(); idx++) { + if (finalization_offsets[idx] != 0) + h->add(data_ptr_regs[idx], finalization_offsets[idx] * io_data_size[idx]); } } -void TileEmitter::emit_impl(const std::vector& in, - const std::vector& out, - const std::vector& vec_pool, - const std::vector& gpr_pool, - const ov::intel_cpu::emitter_context *emit_context) const { - Reg64 work_amount = Reg64(static_cast(in[0])); - std::vector data_ptr_regs; - transform_idxs_to_regs(out, data_ptr_regs); - Label for_body; - // Note that: - // * Work amount must be set by TileScheduler that executes Tiles - // * TileScheduler executes Tile only if it has to perform >= 1 iterations - h->L(for_body); - emit_body(vec_pool, gpr_pool); - emit_ptr_increments(data_ptr_regs); - h->sub(work_amount, increment); - h->cmp(work_amount, increment); - h->jge(for_body, CodeGenerator::T_NEAR); -} - BroadcastMoveEmitter::BroadcastMoveEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : jit_emitter(h, isa, n) { if (n->get_input_element_type(0) != n->get_output_element_type(0)) @@ -467,7 +544,9 @@ StoreEmitter::StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::c if (src_prc != dst_prc) IE_THROW() << "StoreEmitter supports only equal 
input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); - count = ov::as_type_ptr(n)->get_count(); + const auto store = ov::as_type_ptr(n); + count = store->get_count(); + byte_offset = store->get_offset(); in_out_type_ = emitter_in_out_map::vec_to_gpr; store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count)); } @@ -494,7 +573,7 @@ void StoreEmitter::emit_isa(const std::vector &in, const std::vector::type; if (!store_emitter) IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!"; - store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + store_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void StoreEmitter::emit_data() const { @@ -506,7 +585,12 @@ LoadEmitter::LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu if (src_prc != dst_prc) IE_THROW() << "LoadEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); - count = ov::as_type_ptr(n)->get_count(); + const auto load = std::dynamic_pointer_cast(n); + if (!load) + IE_THROW() << "LoadEmitter expects Load snippets op"; + + count = load->get_count(); + byte_offset = load->get_offset(); in_out_type_ = emitter_in_out_map::gpr_to_vec; load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); } @@ -533,7 +617,7 @@ void LoadEmitter::emit_isa(const std::vector &in, const std::vector::type; if (!load_emitter) IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!"; - load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + load_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void LoadEmitter::emit_data() const { @@ -543,8 +627,13 @@ void LoadEmitter::emit_data() const { BroadcastLoadEmitter::BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { if (src_prc != 
dst_prc) - IE_THROW() << "BroadcastEmitters support only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); + IE_THROW() << "BroadcastEmitters support only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); + const auto broadcast_load = std::dynamic_pointer_cast(n); + if (!broadcast_load) + IE_THROW() << "BroadcastLoadEmitter expects BroadcastLoad snippets op"; + + byte_offset = broadcast_load->get_offset(); in_out_type_ = emitter_in_out_map::gpr_to_vec; } @@ -574,16 +663,18 @@ void BroadcastLoadEmitter::emit_isa(const std::vector &in, const std::ve // In doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for `BroadcastLoad`, // key point here is not to add post-increment, it might be fixed by some other approach in future switch (src_prc.size()) { - case 4: h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg]); break; - case 2: h->vpbroadcastw(vmm_dst, h->ptr[in_reg]); break; - case 1: h->vpbroadcastb(vmm_dst, h->ptr[in_reg]); break; + case 4: h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg + byte_offset]); break; + case 2: h->vpbroadcastw(vmm_dst, h->ptr[in_reg + byte_offset]); break; + case 1: h->vpbroadcastb(vmm_dst, h->ptr[in_reg + byte_offset]); break; default: assert(!"unsupported data type"); } } LoadConvertEmitter::LoadConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { - count = ov::as_type_ptr(n)->get_count(); + const auto load = ov::as_type_ptr(n); + count = load->get_count(); + byte_offset = load->get_offset(); in_out_type_ = emitter_in_out_map::gpr_to_vec; load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); } @@ -608,7 +699,7 @@ template void LoadConvertEmitter::emit_isa(const std::vector &in, const std::vector &out) const { if (!load_emitter) IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!"; - 
load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + load_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void LoadConvertEmitter::emit_data() const { @@ -617,7 +708,9 @@ void LoadConvertEmitter::emit_data() const { StoreConvertEmitter::StoreConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { - count = ov::as_type_ptr(n)->get_count(); + const auto store = ov::as_type_ptr(n); + count = store->get_count(); + byte_offset = store->get_offset(); in_out_type_ = emitter_in_out_map::vec_to_gpr; if (ov::is_type(n)) { @@ -647,12 +740,477 @@ template void StoreConvertEmitter::emit_isa(const std::vector &in, const std::vector &out) const { if (!store_emitter) IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!"; - store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + store_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void StoreConvertEmitter::emit_data() const { store_emitter->emit_data(); } +size_t BrgemmEmitter::getBrgIdx(size_t mIdx, size_t kIdx, size_t nIdx) const { + return mIdx * 4 + kIdx * 2 + nIdx; +} +BrgemmEmitter::BrgemmEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& node) : jit_emitter(h, isa, node) { + in_out_type_ = emitter_in_out_map::gpr_to_gpr; + const auto& brgemm_node = as_type_ptr(node); + if (brgemm_node->is_dynamic()) + IE_THROW() << "Snippets don't support code generation for dynamic Brgemm"; + const OutputVector io_values {brgemm_node->input_value(0), brgemm_node->input_value(1), brgemm_node->output(0)}; + std::vector leading_dimensions; + std::vector> io_layouts; + for (const auto& val : io_values) { + const auto& layout = ngraph::snippets::utils::get_node_output_layout(val.get_node_shared_ptr()); + const auto& io_shape = val.get_shape(); + if (layout.empty()) { + // 
empty value indicates a planar layout + leading_dimensions.push_back(io_shape.back()); + std::vector default_layout(io_shape.size()); + std::iota(default_layout.begin(), default_layout.end(), 0); + io_layouts.push_back(default_layout); + } else { + // The idea here is to find "2" (for 4D shapes) in the layout and multiply dimensions that are to the right + // This implies that "3" is the last layout value, otherwise this layout is not supported. + // counting from the end since shape could be prepended with ones + const int64_t num_last_dims = layout.end() - std::find(layout.begin(), layout.end(), layout.size() - 2) - 1; + if (layout.back() != layout.size() - 1 || num_last_dims < 1) + IE_THROW() << "BrgemmEmitter detected invalid layout values: " << + "check that this shape + layout combination is schedulable"; + leading_dimensions.emplace_back( + std::accumulate(io_shape.end() - num_last_dims, io_shape.end(), 1, std::multiplies())); + io_layouts.push_back(layout); + } + } + // todo: leave AMX and VNNI related code for now, it'll help to enable int8 and bf16 support + bool isAMXSupported = mayiuse(avx512_core_bf16_amx_int8) || mayiuse(avx512_core_bf16_amx_bf16); + + const auto& A_shape = io_values[0].get_shape(); + const auto& A_layout = io_layouts[0]; + const auto& C_shape = io_values[2].get_shape(); + const auto& C_layout = io_layouts[2]; + + M = C_shape[C_layout[2]]; + K = A_shape[A_layout[3]]; + M_blk = matmulOptimalM; + M_tail = M % M_blk; + // B_shape[B_layout[3]] + N = C_shape[C_layout[3]]; + + auto brg0Prc = InferenceEngine::details::convertPrecision(brgemm_node->get_input_element_type(0)); + auto brg1Prc = InferenceEngine::details::convertPrecision(brgemm_node->get_input_element_type(1)); + io_data_size = {brg0Prc.size(), brg1Prc.size(), brgemm_node->get_output_element_type(0).size()}; + brg0VnniFactor = 4 / brg0Prc.size(); + bool brg0WithAMX = isAMXSupported && brg0Prc != Precision::FP32 && (K % brg0VnniFactor == 0) && (N % brg0VnniFactor == 0); + + N_blk 
= brg0Prc == Precision::FP32 ? N : + brg0Prc == Precision::BF16 ? 32 : 64; + N_tail = N % N_blk; + K_blk = brg0WithAMX ? brg0Prc == Precision::BF16 ? 32 : 64 + : K; + K_tail = K % K_blk; + + size_t brg0BaseIdx = -1; + for (size_t m = 0; m < 2; m++) { + for (size_t k = 0; k < 2; k++) { + for (size_t n = 0; n < 2; n++) { + auto& brgemmCtx = brgCtxs0[getBrgIdx(m, k, n)]; + + auto M_ = m ? M_tail + : M < M_blk ? 0 : M_blk; + auto N_ = n ? N_tail : N - N_tail; + auto K_ = k ? K_tail : K - K_tail; + auto beta = k && brgCtxs0[getBrgIdx(m, 0, n)].K != 0 ? 1.0f : 0.0f; + + brgemmCtx.M = M_; + brgemmCtx.N = N_; + brgemmCtx.K = K_; + brgemmCtx.LDA = leading_dimensions[0]; + brgemmCtx.LDB = leading_dimensions[1]; + brgemmCtx.LDC = leading_dimensions[2]; + brgemmCtx.dt_in0 = static_cast(DnnlExtensionUtils::IEPrecisionToDataType(brg0Prc)); + brgemmCtx.dt_in1 = static_cast(DnnlExtensionUtils::IEPrecisionToDataType(brg1Prc)); + brgemmCtx.beta = beta; + + // don't create brgemm kernels for empty tiles + if (M_ != 0 && K_ != 0 && N_ != 0) { + if (brg0BaseIdx == -1) + brg0BaseIdx = getBrgIdx(m, k, n); + initBrgemm(brgemmCtx, brgKernels0[getBrgIdx(m, k, n)], brg0WithAMX); + } + } + } + } + + load_offset_a = brgemm_node->get_offset_a(); + load_offset_b = brgemm_node->get_offset_b(); + store_offset_c = brgemm_node->get_offset_c(); +} + +void BrgemmEmitter::initBrgemm(brgemmCtx& ctx, std::unique_ptr& brgKernel, bool use_amx) const { + brgemm_t brgDesc; + brgemm_strides_t strides {static_cast(ctx.M * ctx.K), static_cast(ctx.K * ctx.N)}; + // When implementing int8 support, note that isa logics is more complicated in the MHA node + auto status = brgemm_desc_init(&brgDesc, host_isa_, brgemm_strd, ctx.dt_in0, ctx.dt_in1, + false, false, brgemm_row_major, 1.f, ctx.beta, ctx.LDA, ctx.LDB, ctx.LDC, ctx.M, ctx.N, ctx.K, &strides); + if (status != dnnl_success) + IE_THROW() << "BrgemmEmitter cannot initialize brgemm descriptor due to invalid params"; + + ctx.is_with_amx = use_amx; + status = 
brgemm_init_tiles(brgDesc, ctx.palette); + if (use_amx) + amx_tile_configure(ctx.palette); + + ctx.is_with_comp = ctx.dt_in0 == dnnl_data_type_t::dnnl_s8 && !ctx.is_with_amx; + + brgemm_kernel_t* brgKernel_ = nullptr; + status = brgemm_kernel_create(&brgKernel_, brgDesc); + if (status != dnnl_success) + IE_THROW() << "BrgemmEmitter cannot create brgemm kernel due to invalid params"; + brgKernel.reset(brgKernel_); +} + +void BrgemmEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == cpu::x64::sse41 || host_isa_ == cpu::x64::avx2) { + IE_THROW() << "BrgemmEmitter requires at least avx512_core instruction set"; + } else if (host_isa_ == cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + assert(!"unsupported isa"); + } +} +template +void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brgKernel, int bs, + Reg64 addr_A, Reg64 addr_B, + const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch, + const size_t in0_kernel_offset, const size_t in1_kernel_offset, const size_t out0_kernel_offset) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + size_t gpr_size = 8; + Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax, + h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx}; + size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]); + + h->sub(h->rsp, n_gprs_to_save * gpr_size); + for (size_t i = 0; i < n_gprs_to_save; ++i) + h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]); + + // caller obligation to save k-regs as callee may use them + size_t n_k_regs_to_save = 8; + if (isa == cpu::x64::avx512_core) { + h->sub(h->rsp, n_k_regs_to_save * k_mask_size); + for (size_t i = 0; i < n_k_regs_to_save; ++i) { + if (mayiuse(avx512_core)) + h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast(i))); + else + h->kmovw(h->ptr[h->rsp + i * 
k_mask_size], Opmask(static_cast(i))); + } + } + + // 1. Caller obligation to save vector registers as callee may use them. + // 2. There is an implicit assumption that the host code uses the same + // `isa` as the injector. Once the assumption is wrong, `vecs_count` and + // `vlen` should be replaced with `host_isa::vlen` and + // `host_isa::vecs_count`. + h->sub(h->rsp, get_max_vecs_count() * get_vec_length()); + for (size_t i = 0; i < get_max_vecs_count(); ++i) + h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Vmm(i)); + + // save function address in gpr to pass in call instruction + const auto& brgemm_kernel_overload = static_cast(kernel_execute); + h->mov(h->rbp, reinterpret_cast(brgemm_kernel_overload)); + // todo: several of addr_{A, B, C} could be also abi_paramX, so one of them could be corrupted + // if moving directly h->uni_vmovq(abi_paramX, adr_X). Save them to vector regs to avoid corruption. + // It's likely that a more efficient solution exists. + h->uni_vmovq(Xmm(0), addr_A); + h->uni_vmovq(Xmm(1), addr_B); + h->uni_vmovq(Xmm(2), addr_C); + + const auto data_ptr_reg = [&](Xmm xmm, Xbyak::Reg64 reg, size_t bytes_offset) { + h->uni_vmovq(reg, xmm); + if (bytes_offset) h->add(reg, bytes_offset); + }; + h->mov(abi_param1, reinterpret_cast(brgKernel)); + data_ptr_reg(Xmm(0), abi_param2, in0_kernel_offset); + data_ptr_reg(Xmm(1), abi_param3, in1_kernel_offset); + data_ptr_reg(Xmm(2), abi_param4, out0_kernel_offset); + + // align stack on 16-byte as ABI requires + // note that RBX must not be changed by the callee + h->mov(h->rbx, h->rsp); + h->and_(h->rbx, 0xf); + h->sub(h->rsp, h->rbx); + + h->call(h->rbp); + + h->add(h->rsp, h->rbx); + // restore vector registers + for (int i = static_cast(get_max_vecs_count()) - 1; i >= 0; --i) { + h->uni_vmovups(Vmm(i), h->ptr[h->rsp + i * get_vec_length()]); + } + h->add(h->rsp, (get_max_vecs_count()) * get_vec_length()); + + // restore k registers + if (isa == cpu::x64::avx512_core) { + for (int i = 
n_k_regs_to_save - 1; i >= 0; --i) { + if (mayiuse(avx512_core)) + h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]); + else + h->kmovw(Opmask(i), h->ptr[h->rsp + i * k_mask_size]); + } + h->add(h->rsp, n_k_regs_to_save * k_mask_size); + } + + // restore gpr registers + for (int i = n_gprs_to_save - 1; i >= 0; --i) + h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]); + h->add(h->rsp, n_gprs_to_save * gpr_size); +} + +void BrgemmEmitter::kernel_execute(const brgemm_kernel_t *brg_kernel, const void *A, const void *B, void *C) { + // TODO: There are 4 available abi_params on Windows so we have the copy of brgemm_kernel_execute() function + // with 4 runtime parameters (kernel and I/O) and 4 default parameter values (batch, bs and scratch) + brgemm_kernel_params_t brgemm_p; + + brgemm_p.batch = nullptr; // default value + brgemm_p.ptr_A = A; + brgemm_p.ptr_B = B; + brgemm_p.ptr_C = C; + brgemm_p.ptr_D = C; + brgemm_p.ptr_buf = nullptr; // default value + brgemm_p.ptr_bias = nullptr; + brgemm_p.do_post_ops = 0; + brgemm_p.do_apply_comp = 0; + brgemm_p.skip_accm = 0; + brgemm_p.BS = 1; // default value + assert(brg_kernel); + (*brg_kernel)(&brgemm_p); +} + +template +void BrgemmEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + Reg64 input_0(static_cast(in[0])); + Reg64 input_1(static_cast(in[1])); + Reg64 output_0(static_cast(out[0])); + + for (size_t mb = 0; mb < div_up(M, M_blk); mb++) { + const bool is_M_tail = (M - mb * M_blk < M_blk); + + size_t brgIdx0 = getBrgIdx(0, 0, 0); + size_t K0_step0 = brgCtxs0[brgIdx0].K; + size_t K0_step1 = brgCtxs0[brgIdx0].K * brgCtxs0[brgIdx0].LDB; + size_t N0_step0 = brgCtxs0[brgIdx0].N * brg0VnniFactor; + size_t N0_step1 = brgCtxs0[brgIdx0].N; + for (size_t n = 0; n < 2; n++) { + for (size_t k = 0; k < 2; k++) { + size_t mIdx = is_M_tail ? 
1 : 0; + auto& brgemmCtx = brgCtxs0[getBrgIdx(mIdx, k, n)]; + + if (brgemmCtx.K != 0 && brgemmCtx.N != 0) { + const size_t in0_offset = load_offset_a + (k * K0_step0 + mb * M_blk * brgemmCtx.LDA) * io_data_size[0]; + const size_t in1_offset = load_offset_b + (k * K0_step1 + n * N0_step0) * io_data_size[1]; + const size_t out0_offset = store_offset_c + (n * N0_step1 + mb * M_blk * brgemmCtx.LDC) * io_data_size[2]; + + emit_brgemm_kernel_call(brgKernels0[getBrgIdx(mIdx, k, n)].get(), + 1, + input_0, + input_1, + nullptr, + output_0, + nullptr, + in0_offset, + in1_offset, + out0_offset); + } + } + } + } +} + +HorizonMaxEmitter::HorizonMaxEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) {} + +void HorizonMaxEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "HorizonMax emitter doesn't support " << host_isa_; + } +} + +template +void HorizonMaxEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Xmm dst_xmm = Xmm(out[0]); + Xmm aux_xmm = Xmm(aux_vec_idxs[0]); + + Reg64 aux_reg = Reg64(aux_gpr_idxs[0]); + Reg32 aux_reg_32 = Reg32(aux_reg.getIdx()); + + const size_t vlen = dnnl::impl::cpu::x64::cpu_isa_traits::vlen; + const size_t vec_size = vlen / sizeof(float); + h->sub(h->rsp, vlen); + h->uni_vmovups(h->ptr[h->rsp], src_vmm); + // Let the first value be the max + h->mov(aux_reg, h->ptr[h->rsp]); + h->vmovq(dst_xmm, aux_reg); + for (size_t 
i = 1; i < vec_size; i++) { + h->mov(aux_reg, h->ptr[h->rsp + i * sizeof(float)]); + h->vmovq(aux_xmm, aux_reg); + h->uni_vmaxps(dst_xmm, dst_xmm, aux_xmm); + } + h->add(h->rsp, vlen); +} + +HorizonSumEmitter::HorizonSumEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) {} + +void HorizonSumEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "HorizonSum emitter doesn't support " << host_isa_; + } +} + +template +void HorizonSumEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Xmm dst_xmm = Xmm(out[0]); + Xmm aux_xmm = Xmm(aux_vec_idxs[0]); + + Reg64 aux_reg = Reg64(aux_gpr_idxs[0]); + Reg32 aux_reg_32 = Reg32(aux_reg.getIdx()); + + const size_t vlen = dnnl::impl::cpu::x64::cpu_isa_traits::vlen; + const size_t vec_size = vlen / sizeof(float); + h->sub(h->rsp, vlen); + h->uni_vmovups(h->ptr[h->rsp], src_vmm); + h->uni_vpxor(dst_xmm, dst_xmm, dst_xmm); + for (size_t i = 0; i < vec_size; i++) { + h->mov(aux_reg, h->ptr[h->rsp + i * sizeof(float)]); + h->vmovq(aux_xmm, aux_reg); + h->uni_vaddps(dst_xmm, dst_xmm, aux_xmm); + } + h->add(h->rsp, vlen); +} + +VectorBufferEmitter::VectorBufferEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) {} + +void VectorBufferEmitter::emit_impl(const std::vector& in, + 
const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "Zero emitter doesn't support " << host_isa_; + } +} + +template +void VectorBufferEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm vmm = Vmm(out[0]); + h->uni_vpxor(vmm, vmm, vmm); +} + +FillEmitter::FillEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) { + const auto fill = ov::as_type_ptr(n); + if (fill->get_element_type().size() != 4) { + IE_THROW() << "Fill emitter supports only 4 Byte element types but gets: " << fill->get_element_type(); + } + + offset = fill->get_offset(); + fill_value = fill->get_fill_value(); + prepare_table(); +} + +size_t FillEmitter::aux_gprs_count() const { + // + 1 reg for temp reg for mask in avx512 + return one_of(host_isa_, dnnl::impl::cpu::x64::avx512_core) ? 
2 : 1; +} + +void FillEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "Fill emitter doesn't support " << host_isa_; + } +} + +template +void FillEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Vmm dst_vmm = Vmm(out[0]); + + if (one_of(host_isa_, dnnl::impl::cpu::x64::avx512_core)) { + uint64_t tail_mask = 1; + tail_mask = ~((tail_mask << offset) - tail_mask); + h->mov(Reg64(aux_gpr_idxs[0]), tail_mask); + h->kmovq(k_mask, Reg64(aux_gpr_idxs[0])); + h->vblendmps(dst_vmm | k_mask, src_vmm, table_val("value")); + } else if (one_of(host_isa_, dnnl::impl::cpu::x64::avx2, dnnl::impl::cpu::x64::sse41)) { + uint8 imm = 1; + imm = ~((imm << offset) - imm); // shift load_num bit + if (host_isa_ == dnnl::impl::cpu::x64::sse41 && src_vmm.getIdx() != dst_vmm.getIdx()) { + h->uni_vmovups(dst_vmm, src_vmm); + src_vmm = Vmm(dst_vmm.getIdx()); + } + h->uni_vblendps(dst_vmm, src_vmm, table_val("value"), imm); + } +} + +void FillEmitter::register_table_entries() { + push_arg_entry_of("value", fill_value, true); +} } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp index adfd88dfedd..51b2d2d7840 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp @@ -12,6 +12,11 @@ #include "jit_load_store_emitters.hpp" #include "snippets_transformations/op/store_convert.hpp" +// 
Matmul support: +#include +#include +#include +#include using namespace Xbyak; using ngraph::snippets::AllocatedEmitter; @@ -23,47 +28,49 @@ namespace intel_cpu { #define SNIPPETS_MAX_SNIPPETS_DIMS 12 #define SNIPPETS_MAX_HARNESS_DIMS 5 #define SNIPPETS_MAX_TILE_RANK 2 +#define SNIPPETS_DYNAMIC_MASTER_SHAPE_RANK 6 #define GET_OFF(field) offsetof(jit_snippets_call_args, field) struct jit_snippets_call_args { const void *src_ptrs[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; void *dst_ptrs[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; + void *buffer_scratchpad_ptr = nullptr; }; struct jit_snippets_compile_args { - int64_t scheduler_dims[SNIPPETS_MAX_TILE_RANK] = {}; - int64_t scheduler_offsets[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; - int64_t data_offsets[SNIPPETS_MAX_SNIPPETS_DIMS * SNIPPETS_MAX_HARNESS_DIMS] = {}; - std::vector output_dims = {}; + std::vector master_shape{}; + size_t tile_rank = 0; }; /// -/// \brief jit_container_emitter designed to wrap Emitters that contain other Emitters (presently KernelEmitter, -/// TileSchedulerEmitter and TileEmitter). This is needed to provide common interface for register mapping +/// \brief jit_container_emitter designed to wrap Emitters that contain other Emitters (for example, KernelEmitter) +/// This is needed to provide common interface for register mapping /// (abstract to physical) and nested code access. /// class jit_container_emitter: public jit_emitter { public: jit_container_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + // mapping info contains abstract_to_physical map + regs_pool + using mapping_info = std::pair, std::vector&>; protected: // maps gpr and vec abstract registers to physical ones. Physical reg indexes are taken from the provided pools // (the first 2 args). All the used gpr and vec registers are also stored in the provided sets (the second 2 args). 
- void map_abstract_registers(const std::vector&, const std::vector&, - std::set&, std::set&); + void map_abstract_registers(mapping_info& gpr_map_pool, mapping_info& vec_map_pool, + std::vector& allocated_emitters) const; std::vector body; }; /// /// \brief Kernel is the only entry point to Codogen Jit compilation. Kernel perform abstract-to-physical register -/// mapping and creates pools of available gpr and vec registers. Kernel is expected to contain (at least one) -/// TileSchedulerEmitter. In general the enclosed emitters should be organized in the following way: -/// KernelEmitter { /* entry point, maps registers, creates pools of available registers */ -/// TileSchedulerEmitter { /* executes required inner, avoids emitting code that won't be executed */ -/// TileEmitter { /* inner vector tile */ -/// ... /* All the necessary Load/Strore/elementwise emitters */ -/// } -/// TileEmitter { /* inner scalar tile for tail processing */ -/// ... /* All the necessary Load/Strore/elementwise emitters */ -/// } -/// } +/// mapping and creates a pools of available gpr and vec registers. Kernel usually contains (at least one) +/// LoopBeginEmitter and LoopEndEmitter pair. In general the enclosed emitters should be organized in the following way: +/// KernelEmitter { /* entry point, maps registers, creates pools of available registers */ +/// 1.S LoopBeginEmitter /* Scalar Loop over the outer dimension [START] */ +/// 2.S LoopBeginEmitter /* inner vector loop [START] */ +/// ... /* All the necessary Load/Strore/elementwise emitters */ +/// 2.E LoopEndEmitter /* inner vector loop [END] */ +/// 3.S LoopBeginEmitter /* inner scalar loop for tail processing [START]*/ +/// ... /* All the necessary Load/Strore/elementwise emitters */ +/// 3.E LoopEndEmitter /* inner scalar loop for tail processing [END]*/ +/// 1.E LoopEndEmitter /* Scalar Loop over the outer dimension [END] */ /// } /// Note that Kernel doesn't accept any input arguments. 
/// @@ -88,33 +95,36 @@ private: const std::vector& pool, const std::vector& gpr, const ov::intel_cpu::emitter_context *emit_context) const override; - void init_data_pointers(size_t, size_t, const Reg64&, const Reg64&, const std::vector&) const; + void init_data_pointers(size_t, size_t, bool, const Reg64&, const Reg64&, const std::vector&) const; jit_snippets_compile_args jcp; std::vector gp_regs_pool; - std::vector gp_regs_used; + size_t num_inputs; + size_t num_outputs; + bool is_buffer_needed; + // Vector of indices (lenght = input tensor rank) per every input and output that describes in which order + // corresponding tensor dimensions are accessed (default: consecutive dense, e.g. 0,1,2,3 for 4D tensor). + // Needed to calc i/o offsets. + std::vector> data_layout; + std::vector> io_shapes = {}; + std::vector io_data_size {}; + + // gpr's used to store data pointers, track them to apply offsets in Kernel + std::vector data_ptr_regs_idx; std::vector vec_regs_pool; + const size_t reg_indexes_idx = abi_param1.getIdx(); + const size_t reg_const_params_idx = abi_param2.getIdx(); }; -/// -/// \brief TileSchedulerEmitter contains Tiles to be executed (presently vector and scalar). It calculates data offsets -/// and work amounts, performs data pointer decrements if necessary. It also performs some Tile optimizations: scalar/vector -/// tiles are emitted only if necessary; Tile body could be emitted directly, if only one Tile evaluation is required. 
-/// -/// \param in[0] The number of the node inputs -/// \param in[1] The number of the node outputs -/// \param in[2] The number of elements that fits into vector register -/// -class TileSchedulerEmitter : public jit_container_emitter { +class LoopBeginEmitter : public jit_emitter { public: - TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n); - - size_t get_inputs_num() const override {return 0;} + LoopBeginEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); void emit_code(const std::vector &in, const std::vector &out, const std::vector &pool, const std::vector &gpr) const override; + // todo: it is purely virtual in the base class, but do we need it? + size_t get_inputs_num() const override {return 0;} private: void validate_arguments(const std::vector &in, @@ -127,50 +137,49 @@ private: const std::vector& gpr, const ov::intel_cpu::emitter_context *emit_context) const override; - void emit_tiles(const Reg64&, const std::vector&, size_t, const std::vector& , const std::vector&) const; - - jit_snippets_compile_args jcp; + std::shared_ptr loop_begin; + size_t num_inputs = 0; + bool evaluate_once = false; + size_t work_amount = 0; // need to store work_amount explicitly, since two loops can work on the same dim (e.g. vector + scalar) }; -/// -/// \brief Tile is designed to organize loop over the input and output data. It is essentially a for(...) loop: -/// it performs operations specified by enclosed emitters, advances iteration counters -/// and breaks when necessary. -/// -/// \param in[0] The number of input entities (or scheduler counts) processed during one iteration of the tile. -/// It is expected to be 1 for outer or scalar tiles and vlen for vector tiles. 
-class TileEmitter : public jit_container_emitter { +class LoopEndEmitter : public jit_emitter { public: - TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); - - size_t get_inputs_num() const override {return 0;} - std::vector& get_nested_code(); + LoopEndEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); void emit_code(const std::vector &in, const std::vector &out, const std::vector &pool, const std::vector &gpr) const override; - - void emit_body(const std::vector& vec_pool, const std::vector& gpr_pool) const; - void emit_ptr_increments(const std::vector& data_ptr_regs) const; + // todo: it is purely virtual in the base class, but do we need it? + size_t get_inputs_num() const override {return 0;} private: void validate_arguments(const std::vector &in, const std::vector &out, const std::vector &pool, const std::vector &gpr) const override; + void emit_impl(const std::vector& in, const std::vector& out, const std::vector& pool, const std::vector& gpr, const ov::intel_cpu::emitter_context *emit_context) const override; + std::shared_ptr loop_begin; + std::shared_ptr loop_end; + size_t num_inputs = 0; size_t num_outputs = 0; - std::vector io_dims {}; - std::vector io_data_size {}; - size_t increment = 0; + // keep data_size int64_t to avoid conversion to size_t (and overflow) when multiplied by negative increments or offsets + std::vector io_data_size {}; + size_t wa_increment = 0; + size_t work_amount = 0; + bool evaluate_once = false; + std::vector ptr_increments; + std::vector finalization_offsets; }; + class NopEmitter : public jit_emitter { public: NopEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) @@ -205,7 +214,6 @@ private: void emit_isa(const std::vector &in, const std::vector &out) const; private: - bool use_broadcast; size_t byte_size = 0lu; }; @@ -239,7 +247,7 @@ 
private: /// it's illigal to load/store to the same address multiple times /// Typical application can be if Load and BroadcastLoad are performed from the same pointer. /// If Load goes before BroadcastLoad topologicaly the resilt will be incorrect -/// For scalar loads we can use different tiles. Tiling indeed can be arbitrary and post increment should be somehow coded into ISA. +/// For scalar loads we can use different loops. Tiling indeed can be arbitrary and post increment should be somehow coded into ISA. /// Blocked parameter to tell if input is actually blocked. Broadcast means broadcast by W in other cases no need to substitute load. class MemoryEmitter : public jit_emitter { public: @@ -248,6 +256,9 @@ public: protected: Precision src_prc; Precision dst_prc; + + size_t count = 0; + size_t byte_offset = 0; }; class StoreEmitter : public MemoryEmitter { @@ -268,7 +279,6 @@ private: void emit_data() const override; private: - size_t count; std::unique_ptr store_emitter = nullptr; }; @@ -290,7 +300,6 @@ private: void emit_data() const override; private: - size_t count; std::unique_ptr load_emitter = nullptr; }; @@ -329,7 +338,6 @@ private: void emit_data() const override; private: - size_t count; std::unique_ptr load_emitter = nullptr; }; @@ -351,9 +359,141 @@ private: void emit_data() const override; private: - size_t count; std::unique_ptr store_emitter = nullptr; }; +class BrgemmEmitter : public jit_emitter { +public: + BrgemmEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 2;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; + std::vector io_data_size {}; + struct brgemmCtx { + size_t M, N, K, LDA, LDB, LDC; + 
dnnl_data_type_t dt_in0, dt_in1; + char palette[64]; + bool is_with_amx; + bool is_with_comp; + float beta; + }; + void initBrgemm(brgemmCtx& ctx, std::unique_ptr& brgKernel, bool use_amx) const; + template + void callBrgemm(brgemmCtx& ctx, std::unique_ptr& brgKernel, const void* pin0, const void* pin1, void* pout, void* wsp) const; + size_t getBrgIdx(size_t mIdx, size_t kIdx, size_t nIdx) const; + template + void emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, int bs, + Reg64 addr_A, Reg64 addr_B, + const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch, + const size_t in0_kernel_offset, const size_t in1_kernel_offset, const size_t out0_kernel_offset) const; + static void kernel_execute(const brgemm_kernel_t *brg_kernel, const void *A, const void *B, void *C); + static constexpr size_t BRGEMM_KERNELS_NUM = 8; + static constexpr size_t matmulOptimalM = 32; + brgemmCtx brgCtxs0[BRGEMM_KERNELS_NUM]; + std::unique_ptr brgKernels0[BRGEMM_KERNELS_NUM]; + + size_t M, M_blk, M_tail; + size_t K, K_blk, K_tail; + size_t N, N_blk, N_tail; + size_t brg0VnniFactor; + + size_t load_offset_a = 0lu; + size_t load_offset_b = 0lu; + size_t store_offset_c = 0lu; +}; + +class HorizonMaxEmitter : public jit_emitter { +public: + HorizonMaxEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t aux_gprs_count() const override {return 1;} + size_t aux_vecs_count() const override {return 1;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; +}; + +class HorizonSumEmitter : public jit_emitter { +public: + HorizonSumEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const 
std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t aux_gprs_count() const override {return 1;} + size_t aux_vecs_count() const override {return 1;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; +}; + +class VectorBufferEmitter : public jit_emitter { +public: + VectorBufferEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 0;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; +}; + +class FillEmitter : public jit_emitter { +public: + FillEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t aux_gprs_count() const override; + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; + + void register_table_entries() override; + + size_t offset = 0; + uint32_t fill_value = 0x0; +}; + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index eafce8f3aeb..2e27effd8b5 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -131,19 +131,26 @@ std::map 
Extension::getOpSets() { ngraph::OpSet opset; #define NGRAPH_OP(NAME, NAMESPACE) opset.insert(); + NGRAPH_OP(Brgemm, ngraph::snippets::op) NGRAPH_OP(BroadcastLoad, ngraph::snippets::op) NGRAPH_OP(BroadcastMove, ngraph::snippets::op) + NGRAPH_OP(Buffer, ngraph::snippets::op) NGRAPH_OP(ConvertSaturation, ngraph::snippets::op) NGRAPH_OP(ConvertTruncation, ngraph::snippets::op) + NGRAPH_OP(Fill, ngraph::snippets::op) + NGRAPH_OP(HorizonMax, ngraph::snippets::op) + NGRAPH_OP(HorizonSum, ngraph::snippets::op) NGRAPH_OP(Kernel, ngraph::snippets::op) NGRAPH_OP(Load, ngraph::snippets::op) + NGRAPH_OP(LoadReshape, ngraph::snippets::op) + NGRAPH_OP(LoopBegin, ngraph::snippets::op) + NGRAPH_OP(LoopEnd, ngraph::snippets::op) NGRAPH_OP(Nop, ngraph::snippets::op) NGRAPH_OP(PowerStatic, ngraph::snippets::op) NGRAPH_OP(Scalar, ngraph::snippets::op) NGRAPH_OP(Store, ngraph::snippets::op) NGRAPH_OP(Subgraph, ngraph::snippets::op) - NGRAPH_OP(Tile, ngraph::snippets::op) - NGRAPH_OP(TileScheduler, ngraph::snippets::op) + NGRAPH_OP(VectorBuffer, ngraph::snippets::op) #undef NGRAPH_OP return opset; diff --git a/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp b/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp index 54f046c984a..71fdc039a4b 100644 --- a/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp +++ b/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // #include "snippets_mark_skipped.hpp" -#include "snippets/pass/collapse_subgraph.hpp" +#include "snippets/pass/tokenization.hpp" #include "snippets/op/subgraph.hpp" #include "snippets/utils.hpp" #include @@ -81,7 +81,7 @@ bool canBePerformedAsScaleShift(const std::shared_ptr &node, const i fusingPort = i; dataShape = node->get_input_partial_shape(i); // only one non-const parent is allowed - if (dataShape.is_dynamic() || ++numNonConstInputs != 1) + if (++numNonConstInputs != 1) 
return false; } else { // every const parent must have exactly one child @@ -97,8 +97,7 @@ bool canBePerformedAsScaleShift(const std::shared_ptr &node, const i if (i == fusingPort) continue; const ov::PartialShape weightShape = node->get_input_partial_shape(i); - if (weightShape.is_dynamic() || - !isPerTensorOrPerChannelBroadcastable(dataShape.get_shape(), weightShape.get_shape(), channelAxis, true)) + if (!isPerTensorOrPerChannelBroadcastable(dataShape.get_max_shape(), weightShape.get_max_shape(), channelAxis, true)) return false; } return true; @@ -250,22 +249,20 @@ bool isSuitableChildForFusingMatMul(const std::shared_ptr &node, con NodeFusingType &updatedChainType, int& fusingAxis) { int num_non_const_inputs = 0; bool can_be_converted_to_FC = false; - ov::Shape bias_shape; - ov::Shape matmul_shape; + ov::PartialShape bias_shape; + ov::PartialShape matmul_shape; for (const auto &parent_out : node->input_values()) { const auto parent = parent_out.get_node_shared_ptr(); if (ngraph::op::is_constant(parent)) { bias_shape = parent_out.get_shape(); num_non_const_inputs++; } else { - const auto pshape = parent_out.get_partial_shape(); - if (pshape.is_dynamic() || pshape.get_shape().empty()) + matmul_shape = parent_out.get_partial_shape(); + if (matmul_shape.size() == 0) return false; - matmul_shape = pshape.get_shape(); const auto& grandparents = parent->input_values(); // first check that weights are constant and both activations and weights have static shape if (grandparents.size() == 2 && - grandparents[0].get_partial_shape().is_static() && grandparents[1].get_partial_shape().is_static() && ov::is_type(grandparents[1].get_node_shared_ptr())) { auto rank_a = grandparents[0].get_partial_shape().rank().get_length(); @@ -280,8 +277,9 @@ bool isSuitableChildForFusingMatMul(const std::shared_ptr &node, con // Matmul / FC bias fusion if (ov::is_type(node) && - bias_shape.back() == matmul_shape.back() && - bias_shape.back() == shape_size(bias_shape)) { + 
bias_shape.rbegin()->get_length() == matmul_shape.rbegin()->get_length() && + bias_shape.is_static() && + bias_shape.rbegin()->get_length() == shape_size(bias_shape.get_shape())) { return true; } @@ -431,7 +429,7 @@ bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr &m) { RUN_ON_MODEL_SCOPE(SnippetsMarkSkipped); int channelAxis = DEFAULT_AXIS; for (auto &node : m->get_ordered_ops()) { - if (ngraph::op::is_constant(node)) + if (ngraph::op::is_constant(node) || ov::is_type(node)) continue; if (isSuitableConvolutionParent(node)) { // Initiate fusing chain diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 0443416f356..71e13fe7f07 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -22,6 +22,7 @@ #include #include "emitters/cpu_generator.hpp" +#include "utils/cpu_utils.hpp" #include "snippets_transformations/fuse_load_store_and_convert.hpp" #include "ngraph_transformations/convert_to_swish_cpu.hpp" @@ -64,6 +65,7 @@ void Snippet::copy_snippet() { ngraph::copy_runtime_info(original_snippet, snippet); snippet->set_friendly_name(original_snippet->get_friendly_name()); snippet->set_generator(std::make_shared(host_isa)); + isa_num_lanes = snippet->get_generator()->get_target_machine()->get_lanes(); } void Snippet::initSupportedPrimitiveDescriptors() { @@ -82,11 +84,19 @@ void Snippet::initSupportedPrimitiveDescriptors() { } const size_t ndims = outputShapes[0].getRank(); - const bool isChannelsFirstApplicable = dnnl::impl::utils::one_of(ndims, 1, 2, 3, 4, 5) && dimRanksAreEqual; + // Domain sensitive operations support only Planar layout + const bool isOnlyPlanarApplicable = snippet->has_domain_sensitive_ops(); + const bool isChannelsFirstApplicable = dnnl::impl::utils::one_of(ndims, 1, 2, 3, 4, 5) && dimRanksAreEqual && !isOnlyPlanarApplicable; // Todo: Snippets currently don't support per-channel broadcasting of Blocked descriptors because // 
canonicalization can't distinguish between and cases. // See snippets::op::Subgraph::canonicalize for details. - const bool isBlockedApplicable = dnnl::impl::utils::one_of(ndims, 4, 5) && dimRanksAreEqual; + bool isBlockedApplicable = dnnl::impl::utils::one_of(ndims, 4, 5) && dimRanksAreEqual && !isOnlyPlanarApplicable; + + for (const auto& inShape : inputShapes) { + if (isDynamic && inShape.getRank() != 1) + isBlockedApplicable = isBlockedApplicable && inShape.getMinDims()[1] != Shape::UNDEFINED_DIM && inShape.getMinDims()[1] > 1; + } + enum LayoutType { Planar, ChannelsFirst, @@ -189,42 +199,6 @@ void Snippet::initSupportedPrimitiveDescriptors() { void Snippet::selectOptimalPrimitiveDescriptor() { selectPreferPrimitiveDescriptor(getPrimitivesPriority(), true); } - -void Snippet::createPrimitive() { - // schedule definition part - // it defines offsets, strides and sizes for snippet kernel scheduling - define_schedule(); - - // code generation part - // it might be worth to generate explicitly for scheduler work amount for now, - // but in future some interface should be defined in order to communicate schedule for a kernel - // or generate schedule for a kernel. - // Here kernel is generated for most warying dimension by default. 
- generate(); -} - -void Snippet::execute(dnnl::stream strm) { - if (schedule.ptr == nullptr || !canUseOptimizedImpl) { - IE_THROW() << "Snippet can't use Optimized implementation and can't fallback to reference"; - } - jit_snippets_call_args call_args; - for (size_t i = 0; i < srcMemPtrs.size(); i++) - call_args.src_ptrs[i] = reinterpret_cast(srcMemPtrs[i]->GetData()) + start_offset_in[i]; - - for (size_t i = 0; i < dstMemPtrs.size(); i++) - call_args.dst_ptrs[i] = reinterpret_cast(dstMemPtrs[i]->GetData()) + start_offset_out[i]; - - if (tensorRank == rank6D) { - schedule_6d(call_args); - } else { - schedule_nt(call_args); - } -} - -bool Snippet::created() const { - return getType() == Type::Subgraph; -} - InferenceEngine::Precision Snippet::getRuntimePrecision() const { std::vector inputPrecisions; for (size_t i = 0; i < getParentEdges().size(); i++) { @@ -237,11 +211,268 @@ InferenceEngine::Precision Snippet::getRuntimePrecision() const { return getMaxPrecision(inputPrecisions); } -bool Snippet::canBeInPlace() const { - if (getParentEdgesAtPort(0)[0]->getParent()->getType() == Type::Input) { +bool Snippet::optimizeExecDomain(std::vector& inputShapes, std::vector& outputShapes, + VectorDims &domain, size_t& TileRank) const { + const size_t minimalConcurrency = parallel_get_max_threads(); + const size_t minimalJitWorkAmount = 256; + const size_t ds = domain.size(); + if ( ds <= 2 || // not enough dimensions to collapse + domain[ds-1] >= minimalJitWorkAmount || // There is enough work for 1D Tiles, no need to collapse + domain[ds-1] * domain[ds-2] >= fullWorkAmount / minimalConcurrency) // There won't be enough work for every thread (even one iter) if we collapse return false; + auto findDimsToCollapse = [&]() { + auto collapseLastDims = [](VectorDims& dims, size_t dimsToCollapse) { + if (dimsToCollapse >= dims.size() - 1) + IE_THROW() << "Got invalid number of dims to collapse. 
Expected < " << dims.size() - 1 << " got " << dimsToCollapse; + for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { + dims[dims.size() - 1] *= dims[i]; + } + + for (int i = dims.size() - 2; i >= dimsToCollapse; i--) { + dims[i] = dims[i - dimsToCollapse]; + } + + for (int i = dimsToCollapse - 1; i >= 0; i--) { + dims[i] = 1; + } + }; + int collapsedDims = 0; + size_t currentJitWorkAmount = domain[domain.size() - 1]; + while (currentJitWorkAmount < minimalJitWorkAmount && currentJitWorkAmount < fullWorkAmount) { + if (static_cast(domain.size()) - collapsedDims - 2 < 0) + break; + + bool canCollapse = true; + for (size_t i = 0; i < inputShapes.size(); i++) { + const size_t last = inputShapes[i].size() - 1; + if ((inputShapes[i][last - 1] != 1 && inputShapes[i][last] == 1) || + (inputShapes[i][last - 1] == 1 && inputShapes[i][last] != 1)) { + canCollapse = false; + break; + } + } + + size_t nextJitWorkAmount = currentJitWorkAmount * domain[domain.size() - 2]; + if (fullWorkAmount / nextJitWorkAmount >= minimalConcurrency) { + currentJitWorkAmount = nextJitWorkAmount; + // if we cannot use dim collapsing we should use tile2D + if (!canCollapse) { + if (TileRank < maxTileRank) { + TileRank++; + continue; + } + + break; + } + collapsedDims++; + for (auto &d : inputShapes) + collapseLastDims(d, 1); + for (auto &d : outputShapes) + collapseLastDims(d, 1); + collapseLastDims(domain, 1); + } else { + break; + } + } + return collapsedDims > 0; + }; + return findDimsToCollapse(); +} +ov::PartialShape Snippet::canonicalizeBody() { + auto edgeToBlockedShape = [](const EdgePtr& edge) { + const auto blockedDesc = edge->getMemory().GetDescWithType(); + std::vector dims; + // if blockDim == Shape::UNDEFINED_DIM, then it's a dynamic dimension, and we need to recreate a proper dynamic Dim + for (const auto& d : blockedDesc->getBlockDims()) + dims.emplace_back(d == Shape::UNDEFINED_DIM ? 
-1 : d); + ngraph::PartialShape shape(dims); + ngraph::AxisVector blocking(blockedDesc->getOrder()); + ngraph::element::Type precision = InferenceEngine::details::convertPrecision(blockedDesc->getPrecision()); + return ngraph::snippets::op::Subgraph::BlockedShape{shape, blocking, precision}; + }; + inputShapeIsBlocked.resize(inputShapes.size(), false); + masterShapeIsBlocked = false; + ngraph::snippets::op::Subgraph::BlockedShapeVector input_blocked_shapes; + for (size_t i = 0; i < inputShapes.size(); i++) { + auto blockedShape = edgeToBlockedShape(getParentEdgesAtPort(i)[0]); + inputShapeIsBlocked[i] = std::get<0>(blockedShape).size() != std::get<1>(blockedShape).size(); + masterShapeIsBlocked = masterShapeIsBlocked || inputShapeIsBlocked[i]; + input_blocked_shapes.push_back(blockedShape); } + outputShapeIsBlocked.resize(outputShapes.size(), false); + ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes; + for (size_t i = 0; i < outputShapes.size(); i++) { + auto blockedShape = edgeToBlockedShape(getChildEdgesAtPort(i)[0]); + outputShapeIsBlocked[i] = std::get<0>(blockedShape).size() != std::get<1>(blockedShape).size(); + output_blocked_shapes.push_back(blockedShape); + } + + const auto& canonicalShape = snippet->canonicalize(output_blocked_shapes, input_blocked_shapes); + return canonicalShape; +} +void Snippet::createPrimitive() { + // determine canonicalize, determine master_shape and prepend up to 6D + // NB! normInputShapes are updated, so body reshape might be needed + const auto& canonicalShape = canonicalizeBody(); + // initialize by maximum output dimension. 
Dimensions of outputs should be broadcastable + tensorRank = std::max(static_cast(rank6D), canonicalShape.size()); + + const auto config = getSelectedPrimitiveDescriptor()->getConfig(); + auto initDataSizes = [this, config]() { + const size_t numInputs = inputShapes.size(); + const size_t numOutputs = outputShapes.size(); + dataSize.resize(numInputs + numOutputs); + for (size_t i = 0; i < numInputs; i++) + dataSize[i] = config.inConfs[i].getMemDesc()->getPrecision().size(); + for (size_t i = 0; i < numOutputs; i++) + dataSize[i + numInputs] = config.outConfs[i].getMemDesc()->getPrecision().size(); + }; + initDataSizes(); + + jit_snippets_compile_args jcp; + if (canonicalShape.is_dynamic()) + IE_THROW() << "Snippets: Canonicalization returned dynamic shape in static pipeline"; + masterShape = canonicalShape.get_shape(); + const auto &body = snippet->body_ptr(); + for (const auto& p : body->get_parameters()) + normInputShapes.emplace_back(p->get_output_shape(0)); + for (const auto& r : body->get_results()) + normOutputShapes.emplace_back(r->get_input_shape(0)); + + prepareParams(); + jcp.master_shape = masterShape; + jcp.tile_rank = tileRank; + generate(&jcp); + buffer_scratchpad_size = snippet->get_buffer_scratchpad_size(); + buffer_scratchpad.resize(buffer_scratchpad_size * parallel_get_max_threads(), 0); +} + +std::vector Snippet::shapeInfer() const { + // todo: it's very strange that we don't have broadcast_merge_into for cpu shapes + auto broadcast_merge = [](VectorDims& dst, const VectorDims& src){ + // Ranks are both static. + auto dst_rank = dst.size(); + auto src_rank = src.size(); + const auto new_rank = std::max(dst_rank, src_rank); + dst.insert(dst.begin(), new_rank - dst_rank, 1); + std::vector dims(new_rank); + bool success = true; + for (int64_t i = 0; i < new_rank; i++) { + auto dsti = i < (new_rank - dst_rank) ? 1 : dst[i - (new_rank - dst_rank)]; + auto srci = i < (new_rank - src_rank) ? 
1 : src[i - (new_rank - src_rank)]; + if (dsti != srci && srci != Shape::UNDEFINED_DIM) { + if (dsti == 1 || dsti == Shape::UNDEFINED_DIM) { + dsti = srci; + } else { + success = false; + } + } + } + return success; + }; + for (size_t i = 0; i < getParentEdges().size(); i++) { + VectorDims inDims {getParentEdgesAtPort(i)[0]->getMemory().GetShape().getDims()}; + if (masterShapeIsBlocked && !inputShapeIsBlocked[i]) + inDims.insert(inDims.end(), 1); + // todo: this is a simple master_shape inference for shape-agnostic operations, + // we'll need to account for body operations semantics in the future + if (i == 0) + masterShape = inDims; + else + broadcast_merge(masterShape, inDims); + normInputShapes[i] = std::move(inDims); + } + if (std::any_of(masterShape.begin(), masterShape.end(), [](const Dim& d){ return d == Shape::UNDEFINED_DIM;})) { + std::ostringstream errorMessage; + errorMessage << "Can't compute static master shape for Snippet node with name: " << getName(); + errorMessage << ". Input shapes = ( "; + for (size_t i = 0; i < getParentEdges().size(); i++) { + errorMessage << i << " port = " << getParentEdgesAtPort(i)[0]->getMemory().GetShape().toString() << ", "; + } + errorMessage << "). 
Master shape = ( " << Shape(masterShape).toString() << " )"; + IE_THROW() << errorMessage.str(); + } + + if (normOutputShapes.size() == 1) { + normOutputShapes[0] = masterShape; + return {masterShape}; + } + std::vector outputDims; + std::vector new_shapes; + for (const auto& s : normInputShapes) + new_shapes.emplace_back(s); + const auto& outputShapes = snippet->reshape_body(new_shapes); + for (size_t i = 0; i < outputShapes.size(); i++) + normOutputShapes[i] = outputShapes[i]; + return normOutputShapes; +} + +void Snippet::prepareParams() { + masterShape = getNormalizedDimsBySize(masterShape, tensorRank); + for (auto& pshape : normInputShapes) + pshape = getNormalizedDimsBySize(pshape, tensorRank); + for (auto& pshape : normOutputShapes) + pshape = getNormalizedDimsBySize(pshape, tensorRank); + + tileRank = 1; + fullWorkAmount = std::accumulate(masterShape.begin(), masterShape.end(), 1, std::multiplies()); + if (snippet->has_domain_sensitive_ops()) { + tileRank = 2; + } else { + optimizeExecDomain(normInputShapes, normOutputShapes, masterShape, tileRank); + } + exec_domain = masterShape; + + auto initStartMemoryOffsets = [this]() { + const auto config = getSelectedPrimitiveDescriptor()->getConfig(); + const size_t numInputs = inputShapes.size(); + start_offset_in.resize(numInputs); + srcMemPtrs.resize(numInputs); + for (size_t i = 0; i < numInputs; i++) { + const auto memPtr = getParentEdgeAt(i)->getMemoryPtr(); + srcMemPtrs[i] = memPtr; + start_offset_in[i] = memPtr->GetDescWithType()->getOffsetPadding() * dataSize[i]; + } + const size_t numOutputs = outputShapes.size(); + start_offset_out.resize(numOutputs); + dstMemPtrs.resize(numOutputs); + for (size_t i = 0; i < numOutputs; i++) { + const auto memPtr = getChildEdgeAt(i)->getMemoryPtr(); + dstMemPtrs[i] = memPtr; + start_offset_out[i] = memPtr->GetDescWithType()->getOffsetPadding() * dataSize[i + numInputs]; + } + }; + // initialize start offsets to src and dst memory + // Needs to be done for every set of 
input shapes sce memory ptrs could've updated + initStartMemoryOffsets(); + std::vector scheduler_work_amounts; + // rename schedulerWorkAmount to harnessWorkAmount? + harnessWorkAmount = fullWorkAmount; + const auto rank = exec_domain.size(); + for (auto i = rank - tileRank; i < rank; i++) { + auto& dim = exec_domain[i]; + harnessWorkAmount /= dim; + scheduler_work_amounts.push_back(dim); + dim = 1; + } + + auto& body_rt_info = snippet->body_ptr()->get_rt_info(); + std::vector> new_shapes(normInputShapes); + std::copy(normOutputShapes.begin(), normOutputShapes.end(), std::back_inserter(new_shapes)); + body_rt_info["PluginShapesOverride"] = new_shapes; + snippet->set_master_shape(ov::PartialShape(masterShape)); + snippet->set_tile_rank(tileRank); +} + +bool Snippet::needPrepareParams() const { + return inputShapesModified() || !schedule.ptr; +} + +bool Snippet::canBeInPlace() const { + if (isDynamic || getParentEdgesAtPort(0)[0]->getParent()->getType() == Type::Input) { + return false; + } if (getChildEdges().size() != 1) { return false; } @@ -263,238 +494,11 @@ bool Snippet::canBeInPlace() const { return getInputShapeAtPort(0) == getOutputShapeAtPort(0); } -static void offset_calculation(std::vector& offset, const std::vector& dims_in, const std::vector& dims_out) { - size_t k = 1; - for (int i = offset.size() - 1; i >= 0; i--) { - offset[i] = (dims_in[i] == dims_out[i]) ? k : 0; - k *= dims_in[i]; - } +bool Snippet::created() const { + return getType() == Type::Subgraph; } -static auto collapseLastDims(std::vector& dims, size_t dimsToCollapse) -> void { - if (dimsToCollapse >= dims.size() - 1) - IE_THROW() << "Got invalid number of dims to collapse. 
Expected < " << dims.size() - 1 << " got " << dimsToCollapse; - for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { - dims[dims.size() - 1] *= dims[i]; - } - - for (int i = dims.size() - 2; i >= dimsToCollapse; i--) { - dims[i] = dims[i - dimsToCollapse]; - } - - for (int i = dimsToCollapse - 1; i >= 0; i--) { - dims[i] = 1; - } -} - -void Snippet::define_schedule() { - auto edgeToBlockedShape = [](const EdgePtr& edge) { - const auto blockedDesc = edge->getMemory().GetDescWithType(); - ngraph::Shape shape(blockedDesc->getBlockDims()); - ngraph::AxisVector blocking(blockedDesc->getOrder()); - ngraph::element::Type precision = InferenceEngine::details::convertPrecision(blockedDesc->getPrecision()); - return ngraph::snippets::op::Subgraph::BlockedShape{shape, blocking, precision}; - }; - auto prependWithOnes = [this](const std::vector& dims) { - if (tensorRank <= dims.size()) - return dims; - VectorDims result(tensorRank, 1); - std::copy(dims.begin(), dims.end(), &result[tensorRank - dims.size()]); - return result; - }; - ngraph::snippets::op::Subgraph::BlockedShapeVector input_blocked_shapes; - for (size_t i = 0; i < inputShapes.size(); i++) - input_blocked_shapes.push_back(edgeToBlockedShape(getParentEdgesAtPort(i)[0])); - - ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes; - for (size_t i = 0; i < outputShapes.size(); i++) - output_blocked_shapes.push_back(edgeToBlockedShape(getChildEdgesAtPort(i)[0])); - - exec_domain = snippet->canonicalize(output_blocked_shapes, input_blocked_shapes); - - // initialize by maximum output dimension. 
Dimensions of outputs should be broadcastable - tensorRank = std::max(static_cast(rank6D), exec_domain.size()); - // Canonicalization broadcasts inputs and outputs to max input rank, which can be smaller than tensorRank - // prepend to enable 6D scheduler - exec_domain = prependWithOnes(exec_domain); - const auto &body = snippet->body(); - for (const auto& p : body.get_parameters()) { - dims_in.emplace_back(prependWithOnes(p->get_shape())); - } - - for (size_t i = 0; i < body.get_output_size(); i++) { - dims_out.push_back(prependWithOnes(body.get_output_shape(i))); - } - - const auto config = getSelectedPrimitiveDescriptor()->getConfig(); - auto initOffsets = [this, config]() { - // find max rank input among all outputs - const size_t inputNum = getParentEdges().size(); - offsets_in.resize(inputNum); - for (size_t i = 0; i < inputNum; i++) { - offsets_in[i].resize(tensorRank, 1); - offset_calculation(offsets_in[i], dims_in[i], exec_domain); - for (size_t j = 0; j < tensorRank; j++) { - offsets_in[i][j] *= config.inConfs[i].getMemDesc()->getPrecision().size(); - } - } - - start_offset_in.resize(inputNum); - srcMemPtrs.resize(inputNum); - for (size_t i = 0; i < inputNum; i++) { - const auto memPtr = getParentEdgeAt(i)->getMemoryPtr(); - srcMemPtrs[i] = memPtr; - start_offset_in[i] = memPtr->GetDescWithType()->getOffsetPadding() * - config.inConfs[i].getMemDesc()->getPrecision().size(); - } - - const size_t outputNum = config.outConfs.size(); - offsets_out.resize(outputNum); - for (size_t i = 0; i < outputNum; i++) { - offsets_out[i].resize(tensorRank, 1); - offset_calculation(offsets_out[i], dims_out[i], exec_domain); - for (size_t j = 0; j < tensorRank; j++) { - offsets_out[i][j] *= config.outConfs[i].getMemDesc()->getPrecision().size(); - } - } - - start_offset_out.resize(outputNum); - dstMemPtrs.resize(outputNum); - for (size_t i = 0; i < outputNum; i++) { - const auto memPtr = getChildEdgeAt(i)->getMemoryPtr(); - dstMemPtrs[i] = memPtr; - start_offset_out[i] = 
memPtr->GetDescWithType()->getOffsetPadding() * - config.outConfs[i].getMemDesc()->getPrecision().size(); - } - }; - - auto find_dims_to_collapse = [this, config]() -> int { - int collapsedDims = 0; - size_t minimalConcurrency = parallel_get_max_threads(); - size_t minimalJitWorkAmount = 256; - size_t currentJitWorkAmount = exec_domain.back(); - while (currentJitWorkAmount < minimalJitWorkAmount && currentJitWorkAmount < fullWorkAmount) { - if (static_cast(exec_domain.size()) - collapsedDims - 2 < 0) - break; - - bool canCollapse = true; - for (size_t i = 0; i < dims_in.size(); i++) { - if ((dims_in[i][dims_in[i].size() - 2] != 1 && dims_in[i][dims_in[i].size() - 1] == 1) || - (dims_in[i][dims_in[i].size() - 2] == 1 && dims_in[i][dims_in[i].size() - 1] != 1)) { - canCollapse = false; - break; - } - } - - size_t nextJitWorkAmount = currentJitWorkAmount * exec_domain[exec_domain.size() - 2]; - if (fullWorkAmount / nextJitWorkAmount >= minimalConcurrency) { - currentJitWorkAmount = nextJitWorkAmount; - // if we cannot use dim collapsing we should use tile2D - if (!canCollapse) { - if (tileRank < maxTileRank) { - tileRank++; - continue; - } - - break; - } - - collapsedDims++; - for (auto &d : dims_in) - collapseLastDims(d, 1); - - for (auto &d : dims_out) - collapseLastDims(d, 1); - - collapseLastDims(exec_domain, 1); - } else { - break; - } - } - return collapsedDims; - }; - - auto initSchedulingInfo = [this, config]() -> void { - // initialize scheduling information - sch_offsets_in.resize(offsets_in.size(), 0); - sch_offsets_out.resize(offsets_out.size(), 0); - sch_dims.resize(maxTileRank, 1); - sch_dims[maxTileRank-1] = exec_domain.back(); - schedulerWorkAmount = fullWorkAmount / exec_domain.back(); - if (tileRank > 1) { - sch_dims[maxTileRank - tileRank] = exec_domain[tensorRank - 2]; - schedulerWorkAmount /= exec_domain[tensorRank - 2]; - exec_domain[tensorRank - 2] = 1; - - // update offsets for tile 2D because loaders and stores have ptr shifts in some cases - 
const int64_t vector_size = snippet->get_generator()->get_target_machine()->get_lanes(); - for (size_t i = 0; i < offsets_in.size(); i++) { - const int64_t offset = offsets_in[i][tensorRank - 2]; - const int64_t data_size = config.inConfs[i].getMemDesc()->getPrecision().size(); - if (offset == data_size || offset == vector_size * data_size) { - sch_offsets_in[i] = offset; - } else if ((offset > data_size) || (offset == 0 && dims_in[i].back() != 1 && dims_in[i].back() != vector_size)) { - sch_offsets_in[i] = offset - exec_domain.back() * data_size; - - // If scalar tile executes one time, ptr doesn't move on 1 value - // so we should absolutelly decrease offset - if (exec_domain.back() % vector_size == 1) { - sch_offsets_in[i] += data_size; - } - } - } - - for (size_t i = 0; i < offsets_out.size(); i++) { - const int64_t offset = offsets_out[i][tensorRank - 2]; - const size_t data_size = config.outConfs[i].getMemDesc()->getPrecision().size(); - if (offset == data_size || offset == vector_size * data_size) { - sch_offsets_out[i] = offset; - } else if ((offset > data_size) || (offset == 0 && dims_out[i].back() != 1 && dims_out[i].back() != vector_size)) { - sch_offsets_out[i] = offset - exec_domain.back() * data_size; - - // If scalar tile executes one time, ptr doesn't move on 1 value - // so we should absolutelly decrease offset - if (exec_domain.back() % vector_size == 1) { - sch_offsets_out[i] += data_size; - } - } - } - } - }; - - fullWorkAmount = 1; - for (const auto &d : exec_domain) { - fullWorkAmount *= d; - } - - batchDimIdx = tensorRank - exec_domain.size(); - // Note that exec_domain can be modified inside find_dims_to_collapse() and/or initSchedulingInfo() - find_dims_to_collapse(); - - initOffsets(); - initSchedulingInfo(); -} - -void Snippet::generate() { - jit_snippets_compile_args jcp; - jcp.output_dims = exec_domain; - std::copy(sch_dims.begin(), sch_dims.end(), jcp.scheduler_dims); - std::copy(sch_offsets_in.begin(), sch_offsets_in.end(), 
jcp.scheduler_offsets); - std::copy(sch_offsets_out.begin(), sch_offsets_out.end(), &jcp.scheduler_offsets[sch_offsets_in.size()]); - size_t harness_num_dims = jcp.output_dims.size() - 1; - if (harness_num_dims > SNIPPETS_MAX_HARNESS_DIMS) { - canUseOptimizedImpl = false; - harness_num_dims = SNIPPETS_MAX_HARNESS_DIMS; - } - for (size_t i = 0; i < inputShapes.size(); i++) { - auto b = offsets_in[i].begin(); - std::copy(b, b + harness_num_dims, &jcp.data_offsets[i * harness_num_dims]); - } - for (size_t i = 0; i < outputShapes.size(); i++) { - auto b = offsets_out[i].begin(); - std::copy(b, b + harness_num_dims, &jcp.data_offsets[(inputShapes.size() + i) * harness_num_dims]); - } - +void Snippet::generate(const jit_snippets_compile_args* jcp) { ov::pass::Manager optManager; optManager.register_pass(); optManager.register_pass(); @@ -515,25 +519,54 @@ void Snippet::generate() { return convert->get_input_element_type(0) != ov::element::f32; return true; }); - - schedule = snippet->generate(optManager, reinterpret_cast(&jcp)); + schedule = snippet->generate(optManager, reinterpret_cast(jcp)); } -void Snippet::schedule_6d(const jit_snippets_call_args& call_args) const { +void Snippet::update_ptrs(jit_snippets_call_args& call_args) { + for (size_t i = 0; i < srcMemPtrs.size(); i++) + call_args.src_ptrs[i] = reinterpret_cast(srcMemPtrs[i]->GetData()) + start_offset_in[i]; + + for (size_t i = 0; i < dstMemPtrs.size(); i++) + call_args.dst_ptrs[i] = reinterpret_cast(dstMemPtrs[i]->GetData()) + start_offset_out[i]; + + if (buffer_scratchpad_size > 0) { + call_args.buffer_scratchpad_ptr = + reinterpret_cast(buffer_scratchpad.data()) + parallel_get_thread_num() * buffer_scratchpad_size; + } +} + +void Snippet::execute(dnnl::stream strm) { + if (schedule.ptr == nullptr) { + IE_THROW() << "Snippet can't use Optimized implementation and can't fallback to reference"; + } + if (tensorRank == rank6D) { + schedule_6d(); + } else { + schedule_nt(); + } +} + +void 
Snippet::schedule_6d() { const auto& dom = exec_domain; // < N, C, H, W > < 1, 1, N, C*H*W> parallel_for5d(dom[0], dom[1], dom[2], dom[3], dom[4], [&](int64_t d0, int64_t d1, int64_t d2, int64_t d3, int64_t d4) { int64_t indexes[] = {d0, d1, d2, d3, d4}; + jit_snippets_call_args call_args; + update_ptrs(call_args); + schedule.get_callable()(indexes, &call_args); }); } -void Snippet::schedule_nt(const jit_snippets_call_args& call_args) const { +void Snippet::schedule_nt() { const auto& work_size = exec_domain; parallel_nt(0, [&](const int ithr, const int nthr) { + jit_snippets_call_args call_args; + update_ptrs(call_args); + size_t start = 0, end = 0; - splitter(schedulerWorkAmount, nthr, ithr, start, end); + splitter(harnessWorkAmount, nthr, ithr, start, end); std::vector indexes(work_size.size() - 1, 0); for (size_t iwork = start; iwork < end; ++iwork) { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 9b9b06036fb..ce9a01f951b 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -38,6 +38,9 @@ public: // Here we convert to canonical for & jit everything void createPrimitive() override; + void prepareParams() override; + std::vector shapeInfer() const override; + bool needPrepareParams() const override; bool canBeInPlace() const override; bool created() const override; @@ -55,13 +58,15 @@ private: // NOTE: Before call mutex should be initialized void copy_snippet(); - void define_schedule(); - - void generate(); + ov::PartialShape canonicalizeBody(); + // returns true if exec domain was modified + bool optimizeExecDomain(std::vector&, std::vector&, VectorDims&, size_t&) const; + void generate(const jit_snippets_compile_args*); + inline void update_ptrs(jit_snippets_call_args&); // Evaluates generated snippet using parallel backend - void schedule_6d(const jit_snippets_call_args& const_args) const; - void schedule_nt(const jit_snippets_call_args& 
const_args) const; + void schedule_6d(); + void schedule_nt(); // Original subgraph node std::shared_ptr original_snippet; @@ -73,34 +78,39 @@ private: // Holds ISA version used is codeGeneration target dnnl::impl::cpu::x64::cpu_isa_t host_isa; + size_t isa_num_lanes; // number of elements that fit in vector size // Holds index of output used as in execution domain // it should be compatible with a schedule's work size std::vector exec_domain = {}; /// scheduling info - size_t batchDimIdx = 0; size_t tensorRank = 0; size_t tileRank = 1; size_t fullWorkAmount = 0; - size_t schedulerWorkAmount = 0; + size_t harnessWorkAmount = 0; const size_t maxTileRank = 2; std::vector srcMemPtrs = {}; std::vector dstMemPtrs = {}; + std::vector dataSize = {}; + + // this is needed for fast shape inference of blocking-invariant prepended shapes + std::vector inputShapeIsBlocked = {}; // we need this info to shape-infer mixed layouts + std::vector outputShapeIsBlocked = {}; // we need this info to shape-infer mixed layouts + bool masterShapeIsBlocked = false; + + // master shape is mutable since we need to modify it inside const shapeInfer method + mutable VectorDims masterShape = {}; + mutable std::vector normInputShapes = {}; + mutable std::vector normOutputShapes = {}; - std::vector> dims_in = {}; - std::vector> offsets_in = {}; std::vector start_offset_in = {}; std::vector start_offset_out = {}; - std::vector> dims_out = {}; - std::vector> offsets_out = {}; - - std::vector sch_dims = {}; - std::vector sch_offsets_in = {}; - std::vector sch_offsets_out = {}; - bool canUseOptimizedImpl = true; + // Buffer scratchpad + std::vector buffer_scratchpad = {}; + size_t buffer_scratchpad_size = 0; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 04e6e8c23e8..871cc3a5381 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -396,12 +396,24 @@ Engine::LoadExeNetworkImpl(const 
InferenceEngine::CNNNetwork &network, const std const auto& dynamicBatchProp = config.find(InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED); const bool enableDynamicBatch = (dynamicBatchProp != config.end() && dynamicBatchProp->second == PluginConfigParams::YES) || engConfig.enableDynamicBatch; - const bool enableSnippets = !enableDynamicBatch; + + auto snippetsMode = enableDynamicBatch ? Config::SnippetsMode::Disable : Config::SnippetsMode::Enable; + const auto& snippetsModeProp = config.find(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE); + if (snippetsMode == Config::SnippetsMode::Enable && snippetsModeProp != config.end()) { + const auto& val = snippetsModeProp->second; + if (val == PluginConfigInternalParams::IGNORE_CALLBACK) + snippetsMode = Config::SnippetsMode::IgnoreCallback; + else if (val == PluginConfigInternalParams::DISABLE) + snippetsMode = Config::SnippetsMode::Disable; + else + IE_THROW() << "Wrong value for property key SNIPPETS_MODE. Expected values: ENABLE/DISABLE/IGNORE_CALLBACK"; + } + auto nGraphFunc = clonedNetwork.getFunction(); DEBUG_LOG(PrintableModel(*nGraphFunc, "org_")); - Transformations transformations(nGraphFunc, enableLPT, enableSnippets, enableBF16, isLegacyAPI(), engConfig); + Transformations transformations(nGraphFunc, enableLPT, enableBF16, isLegacyAPI(), snippetsMode, engConfig); transformations.UpToCpuSpecificOpSet(); // need to check that all outputs have static shapes @@ -645,7 +657,18 @@ QueryNetworkResult Engine::QueryNetwork(const CNNNetwork& network, const std::ma const auto& lptProp = config.find(InferenceEngine::PluginConfigInternalParams::KEY_LP_TRANSFORMS_MODE); const bool enableLPT = (lptProp != config.end() && lptProp->second == PluginConfigParams::YES) /* enabled in the orig_config*/ || Config::LPTransformsMode::On == engConfig.lpTransformsMode /* or already enabled */; - const bool enableSnippets = !conf.enableDynamicBatch; + + auto snippetsMode = conf.enableDynamicBatch ? 
Config::SnippetsMode::Disable : Config::SnippetsMode::Enable; + const auto& snippetsModeProp = config.find(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE); + if (snippetsMode == Config::SnippetsMode::Enable && snippetsModeProp != config.end()) { + const auto& val = snippetsModeProp->second; + if (val == PluginConfigInternalParams::IGNORE_CALLBACK) + snippetsMode = Config::SnippetsMode::IgnoreCallback; + else if (val == PluginConfigInternalParams::DISABLE) + snippetsMode = Config::SnippetsMode::Disable; + else + IE_THROW() << "Wrong value for property key SNIPPETS_MODE. Expected values: ENABLE/DISABLE/IGNORE_CALLBACK"; + } auto model = network.getFunction(); if (model == nullptr) { @@ -657,7 +680,7 @@ QueryNetworkResult Engine::QueryNetwork(const CNNNetwork& network, const std::ma auto supported = GetSupportedNodes(model, [&](std::shared_ptr& model) { - Transformations transformation(model, enableLPT, enableSnippets, conf.enforceBF16, isLegacyAPI(), engConfig); + Transformations transformation(model, enableLPT, conf.enforceBF16, isLegacyAPI(), snippetsMode, engConfig); transformation.UpToCpuSpecificOpSet(); transformation.CpuSpecificOpSet(); }, diff --git a/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp index af29d870b0b..b47fcfe73da 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp @@ -42,12 +42,12 @@ ov::intel_cpu::pass::FuseLoadConvert::FuseLoadConvert() { std::dynamic_pointer_cast(convert)) { load_convert = std::make_shared(param, convert_saturation->get_destination_type(), - load->get_count()); + load->get_count(), load->get_offset()); } else if (const auto convert_truncation = std::dynamic_pointer_cast(convert)) { load_convert = std::make_shared(param, 
convert_truncation->get_destination_type(), - load->get_count()); + load->get_count(), load->get_offset()); } else { throw ngraph::ngraph_error( "Type of Convert op is undefined. Supports only fusing Load and ConvertTruncation or ConvertSaturation ops"); @@ -91,12 +91,12 @@ ov::intel_cpu::pass::FuseStoreConvert::FuseStoreConvert() { std::dynamic_pointer_cast(convert)) { store_convert = std::make_shared(input, convert_saturation->get_destination_type(), - store->get_count()); + store->get_count(), store->get_offset()); } else if (const auto convert_truncation = std::dynamic_pointer_cast(convert)) { store_convert = std::make_shared(input, convert_truncation->get_destination_type(), - store->get_count()); + store->get_count(), store->get_offset()); } else { throw ngraph::ngraph_error( "Type of Convert op is undefined. Supports only fusing Store and ConvertTruncation or ConvertSaturation ops"); diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp index fcf59b169ef..dbb8046f636 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp @@ -11,8 +11,9 @@ using namespace std; using namespace ov; -intel_cpu::LoadConvertSaturation::LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Load(x, count), m_destination_type(destination_type) { +intel_cpu::LoadConvertSaturation::LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Load(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -30,11 +31,12 @@ void intel_cpu::LoadConvertSaturation::validate_and_infer_types() { std::shared_ptr intel_cpu::LoadConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const { 
INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } -intel_cpu::LoadConvertTruncation::LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Load(x, count), m_destination_type(destination_type) { +intel_cpu::LoadConvertTruncation::LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Load(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -52,5 +54,5 @@ void intel_cpu::LoadConvertTruncation::validate_and_infer_types() { std::shared_ptr intel_cpu::LoadConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp index 50d7fe44c2e..9931a6f057d 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp @@ -20,7 +20,7 @@ class LoadConvertSaturation : public ngraph::snippets::op::Load { public: OPENVINO_OP("LoadConvertSaturation", "SnippetsOpset", ngraph::snippets::op::Load); - LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); LoadConvertSaturation() = default; ov::element::Type 
get_destination_type() const { return m_destination_type; } @@ -47,7 +47,7 @@ class LoadConvertTruncation : public ngraph::snippets::op::Load { public: OPENVINO_OP("LoadConvertTruncation", "SnippetsOpset", ngraph::snippets::op::Load); - LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); LoadConvertTruncation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp index 5362ff9c9b6..52921e681e9 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp @@ -11,8 +11,9 @@ using namespace std; using namespace ov; -intel_cpu::StoreConvertSaturation::StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Store(x, count), m_destination_type(destination_type) { +intel_cpu::StoreConvertSaturation::StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Store(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -30,11 +31,12 @@ void intel_cpu::StoreConvertSaturation::validate_and_infer_types() { std::shared_ptr intel_cpu::StoreConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } -intel_cpu::StoreConvertTruncation::StoreConvertTruncation(const 
Output& x, const ov::element::Type& destination_type, const size_t count) : - Store(x, count), m_destination_type(destination_type) { +intel_cpu::StoreConvertTruncation::StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Store(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -52,5 +54,5 @@ void intel_cpu::StoreConvertTruncation::validate_and_infer_types() { std::shared_ptr intel_cpu::StoreConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp index d44ccacd4f4..ee6410682b8 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp @@ -20,7 +20,7 @@ class StoreConvertSaturation : public ngraph::snippets::op::Store { public: OPENVINO_OP("StoreConvertSaturation", "SnippetsOpset", ngraph::snippets::op::Store); - StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); StoreConvertSaturation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } @@ -47,7 +47,7 @@ class StoreConvertTruncation : public ngraph::snippets::op::Store { public: OPENVINO_OP("StoreConvertTruncation", "SnippetsOpset", ngraph::snippets::op::Store); - StoreConvertTruncation(const Output& x, const ov::element::Type& 
destination_type, const size_t count = 1lu); + StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); StoreConvertTruncation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } diff --git a/src/plugins/intel_cpu/src/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformation_pipeline.cpp index b4e0de4ff3e..293e6fc500f 100644 --- a/src/plugins/intel_cpu/src/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformation_pipeline.cpp @@ -95,7 +95,7 @@ #include "ngraph_transformations/swap_convert_transpose.hpp" // Snippets -#include "snippets/pass/collapse_subgraph.hpp" +#include "snippets/pass/tokenization.hpp" #include "snippets/pass/common_optimizations.hpp" // Misc @@ -140,7 +140,7 @@ void Transformations::UpToCpuSpecificOpSet() { ngraph::pass::low_precision::LowPrecision::isFunctionQuantized(model) && CPU_DEBUG_CAP_IS_TRANSFORMATION_ENABLED(config.debugCaps, Lpt); - const bool useSnippets = enableSnippets && + const bool useSnippets = snippetsMode != Config::SnippetsMode::Disable && CPU_DEBUG_CAP_IS_TRANSFORMATION_ENABLED(config.debugCaps, Snippets); auto defaultPrecisions = useLpt ? ngraph::pass::low_precision::precision_set::int8_support : std::vector{}; @@ -543,47 +543,95 @@ void Transformations::PostLpt() { return false; }); + // Float MHA is supported by snippets now + if (!enableBF16) { + postLPTPassManager.get_pass_config()->disable(); + postLPTPassManager.get_pass_config()->disable(); + } + // Execute before snippets. 
Otherwise FQ will be converted to Subgraph postLPTPassManager.register_pass(); postLPTPassManager.run_passes(model); } void Transformations::MainSnippets(void) { - if (!enableSnippets || + if (snippetsMode == Config::SnippetsMode::Disable || !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) // snippets are implemeted only for relevant platforms (avx2+ extentions) return; - ov::pass::Manager snippetsManager; - snippetsManager.register_pass(); - snippetsManager.register_pass(); - snippetsManager.register_pass(); - snippetsManager.get_pass_config()->set_callback( - [](const std::shared_ptr& n) -> bool { - // CPU Plugin support Swish in Subgraph via conversion to SwichCPU which assumes second input to be constant - if (ov::is_type(n)) { - if (n->inputs().size() > 1 && !ov::is_type(n->get_input_node_shared_ptr(1))) - return true; - } + ngraph::pass::Manager snippetsManager; + if (snippetsMode != Config::SnippetsMode::IgnoreCallback) + snippetsManager.register_pass(); + snippetsManager.register_pass(); - const auto& inputs = n->inputs(); - // todo: clarify whether we can evaluate snippets on const paths - const bool has_only_const_inputs = std::all_of(inputs.begin(), inputs.end(), - [](const ov::Input &in) { - return ov::is_type(in.get_source_output().get_node_shared_ptr()); - }); - // todo: clarify whether we can evaluate snippets on inputs with larger ranks - auto rank_is_too_large = [](const ov::descriptor::Tensor& t ) { - // callback is called has_supported_in_out(), so it's safe to assume that the shapes are static - return t.get_partial_shape().rank().get_length() > 6; - }; - const bool bad_input_rank = std::any_of(inputs.begin(), inputs.end(), - [&](const ov::Input& in) {return rank_is_too_large(in.get_tensor());}); - const auto& outputs = n->outputs(); - const bool bad_output_rank = std::any_of(outputs.begin(), outputs.end(), - [&](const ov::Output& out) {return rank_is_too_large(out.get_tensor());}); - return has_only_const_inputs || bad_input_rank || 
bad_output_rank; - }); - snippetsManager.register_pass(); + const bool isMHASupported = + !enableBF16 && // TODO: Need to add BF16 support for MHA in Snippets + dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core); // MHA has BRGEMM that is supported only on AVX512 platforms + if (!isMHASupported) { + snippetsManager.get_pass_config()->disable(); + } + if (snippetsMode != Config::SnippetsMode::IgnoreCallback) { + snippetsManager.get_pass_config()->set_callback( + [](const std::shared_ptr& n) -> bool { + const auto pshape = n->get_output_partial_shape(0); + const auto shape = pshape.get_shape(); + const auto parallel_work_amount = + std::accumulate(shape.rbegin() + 2, shape.rend(), 1, std::multiplies()); + const auto kernel_buffer_size = + std::accumulate(shape.rbegin(), shape.rbegin() + 2, 1, std::multiplies()) * + n->get_output_element_type(0).size(); + // Heuristic values: + // parallelism work amount - not enough work amount for parallelism + // kernel work amount - large shape for kernel execution, not cache-local + // TODO: The heuristics will be removed after + // - loop blocking support on code generation level + // - parallelism support on JIT level + const auto needed_num_of_threads = 12lu; + const auto l2_cache_size = dnnl::utils::get_cache_size(2, true); + const auto is_unsupported_parallel_work_amount = parallel_get_num_threads() / 2 > parallel_work_amount && + parallel_work_amount < needed_num_of_threads; + const auto is_unsupported_kernel_work_amount = kernel_buffer_size > l2_cache_size; + return is_unsupported_parallel_work_amount || is_unsupported_kernel_work_amount; + }); + snippetsManager.get_pass_config()->set_callback( + [](const std::shared_ptr& n) -> bool { + // CPU Plugin support Swish in Subgraph via conversion to SwichCPU which assumes second input to be constant + const bool is_unsupported_swish = + ov::is_type(n) && n->inputs().size() > 1 && + !ov::is_type(n->get_input_node_shared_ptr(1)); + // todo: general tokenization flow 
is not currently supported for these operations. + // they can be tokenized only as a part of complex patterns + const bool is_disabled_tokenization = (ov::is_type(n) || + ov::is_type(n) || + ov::is_type(n) || + ov::is_type(n) || + ov::is_type(n) || + ov::is_type(n)); + const auto& inputs = n->inputs(); + // todo: clarify whether we can evaluate snippets on const paths + const bool has_only_const_inputs = std::all_of(inputs.begin(), inputs.end(), + [](const ov::Input& in) { + return ov::is_type( + in.get_source_output().get_node_shared_ptr()); + }); + // todo: clarify whether we can evaluate snippets on inputs with larger ranks + auto rank_is_too_large = [](const ov::descriptor::Tensor& t) { + // callback is called has_supported_in_out(), so it's safe to assume that the shapes are static + return t.get_partial_shape().rank().get_length() > 6; + }; + const bool bad_input_rank = std::any_of(inputs.begin(), inputs.end(), + [&](const ov::Input& in) { + return rank_is_too_large(in.get_tensor()); + }); + const auto& outputs = n->outputs(); + const bool bad_output_rank = std::any_of(outputs.begin(), outputs.end(), + [&](const ov::Output& out) { + return rank_is_too_large(out.get_tensor()); + }); + return has_only_const_inputs || bad_input_rank || bad_output_rank || is_unsupported_swish || + is_disabled_tokenization; + }); + } snippetsManager.run_passes(model); } diff --git a/src/plugins/intel_cpu/src/transformation_pipeline.h b/src/plugins/intel_cpu/src/transformation_pipeline.h index f56427afa8d..e01ad93f0fd 100644 --- a/src/plugins/intel_cpu/src/transformation_pipeline.h +++ b/src/plugins/intel_cpu/src/transformation_pipeline.h @@ -24,15 +24,15 @@ class Transformations { public: Transformations(const std::shared_ptr& initialModel, const bool enableLpt, - const bool enableSnippets, const bool enableBF16, const bool isLegacyApi, + Config::SnippetsMode& snippetsMode, const Config& config) : model(initialModel), enableLpt(enableLpt), - enableSnippets(enableSnippets), 
enableBF16(enableBF16), isLegacyApi(isLegacyApi), + snippetsMode(snippetsMode), config(config) {} void UpToCpuSpecificOpSet(); @@ -41,10 +41,10 @@ public: private: std::shared_ptr model; const bool enableLpt; - const bool enableSnippets; const bool enableBF16; const bool isLegacyApi; const Config& config; + const Config::SnippetsMode snippetsMode; void PreLpt(const std::vector& defaultPrecisions, const bool isLegacyApi); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index cf4ff70fbe9..fd697eedd77 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -215,6 +215,10 @@ std::vector disabledTestPatterns() { // tests are useless on such platforms retVector.emplace_back(R"(.*(BF|bf)16.*)"); retVector.emplace_back(R"(.*bfloat16.*)"); + // MatMul in Snippets uses BRGEMM that is supported only on AVX512 platforms + // Disabled Snippets MHA tests as well because MHA pattern contains MatMul + retVector.emplace_back(R"(.*Snippets.*MHA.*)"); + retVector.emplace_back(R"(.*Snippets.*(MatMul|Matmul).*)"); } if (!InferenceEngine::with_cpu_x86_avx512_core_amx_int8()) //TODO: Issue 92895 diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp index acebcb77d6e..86b7d6b3b11 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp @@ -12,15 +12,49 @@ namespace snippets { namespace { +namespace snippets_static_1 { +// These inputs are needed to test static Loop optimizations (emit the whole tile, body with increments, set WA etc) +std::vector inShapesStatic1{{1, 16, 29, 1}, {1, 16, 29, 
7}, {1, 16, 29, 8}, {1, 16, 29, 15}, {1, 16, 29, 16}, {1, 16, 29, 31}}; +std::vector inShapesStatic2{{1, 16, 29, 1}, {1, 16, 1, 1}, {1, 1, 1, 1}}; + INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, Add, + ::testing::Combine( + ::testing::ValuesIn(inShapesStatic1), + ::testing::ValuesIn(inShapesStatic2), + ::testing::Values(ov::element::f32), + ::testing::Values(1), // Add + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Add::getTestCaseName); +// test cross-tile (vector vs scalar) optimizations in the absence of vector tile +std::vector> inShapesStatic{ + {{1, 128, 1, 1}, {1, 128, 1, 1}}, + {{1, 128, 1, 9}, {1, 128, 1, 9}}, + {{1, 128, 1, 17}, {1, 128, 1, 17}}, + {{1, 128, 1, 29}, {1, 128, 1, 29}}, + {{1, 128, 1, 33}, {1, 128, 1, 33}}, + {{1, 128, 9, 30}, {1, 128, 1, 30}}, + {{1, 128, 9, 1}, {1, 128, 1, 30}}, +}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddPair, + ::testing::Combine( + ::testing::ValuesIn(inShapesStatic), + ::testing::Values(ov::element::f32), + ::testing::Values(1), + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + AddPair::getTestCaseName); + +} // namespace snippets_static_1 + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddConst, ::testing::Combine( ::testing::Values(ov::Shape {1, 42, 16, 64}), - ::testing::Values(ov::Shape {1, 42, 16, 1}), ::testing::Values(ov::element::f32), - ::testing::Values(1), - ::testing::Values(1), // one node - Add + ::testing::Values(1), // Add + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts ::testing::Values(CommonTestUtils::DEVICE_CPU)), - Add::getTestCaseName); + AddConst::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddRollConst, ::testing::Combine( diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/conv_eltwise.cpp 
b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/conv_eltwise.cpp index bdf0fd38a50..ffc6ef57add 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/conv_eltwise.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/conv_eltwise.cpp @@ -10,26 +10,26 @@ namespace test { namespace snippets { namespace { - ov::Shape convInputShape {1, 10, 16, 16}; - INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvAdd, ConvEltwise, - ::testing::Combine( - ::testing::Values(convInputShape), - ::testing::Values(convInputShape), - ::testing::Values(std::shared_ptr (std::make_shared())), // non-tokenizable - ::testing::Values(6), // num nodes = 6: Convert + Convolution + 4 Reorders on Convs in&outs - ::testing::Values(0), // num subgraphs = 0: No subgraph since all ops eltwises fused into Convolution - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - ConvEltwise::getTestCaseName); +ov::Shape convInputShape {1, 10, 16, 16}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvAdd, ConvEltwise, + ::testing::Combine( + ::testing::Values(convInputShape), + ::testing::Values(convInputShape), + ::testing::Values(std::shared_ptr (std::make_shared())), // non-tokenizable + ::testing::Values(6), // num nodes = 6: Convert + Convolution + 4 Reorders on Convs in&outs + ::testing::Values(0), // num subgraphs = 0: No subgraph since all ops eltwises fused into Convolution + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ConvEltwise::getTestCaseName); - INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvMul, ConvEltwise, - ::testing::Combine( - ::testing::Values(convInputShape), - ::testing::Values(convInputShape), - ::testing::Values(std::shared_ptr (std::make_shared())), // fully-tokenizable - ::testing::Values(7), //num nodes = 7: Convert + Convolution + Subgraph + Reorders - ::testing::Values(1), // num subgraphs = 0: Mul (2 inputs) can't be fused into Conv => Subgraph is created - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - 
ConvEltwise::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvMul, ConvEltwise, + ::testing::Combine( + ::testing::Values(convInputShape), + ::testing::Values(convInputShape), + ::testing::Values(std::shared_ptr (std::make_shared())), // fully-tokenizable + ::testing::Values(7), //num nodes = 7: Convert + Convolution + Subgraph + Reorders + ::testing::Values(1), // num subgraphs = 1: Mul (2 inputs) can't be fused into Conv => Subgraph is created + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ConvEltwise::getTestCaseName); } // namespace } // namespace snippets } // namespace test diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp index e2890469356..b3d2907e7ee 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp @@ -30,10 +30,10 @@ const std::vector, std::vector> inputShapes_Convert = { - { ov::Shape{2, 16} }, - { ov::Shape{5, 5} }, - { ov::Shape{2, 12, 1} } +const std::vector> inputShapes_Convert = { + { ov::PartialShape{2, 16} }, + { ov::PartialShape{5, 5} }, + { ov::PartialShape{2, 12, 1} } }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Convert, Convert, @@ -57,10 +57,10 @@ const std::vector, std::vector> inputShapes_ConvertInput = { - { ov::Shape{2, 16}, ov::Shape{1, 16} }, - { ov::Shape{5, 18}, ov::Shape{5, 1} }, - { ov::Shape{3, 1}, ov::Shape{3, 21} } +const std::vector> inputShapes_ConvertInput = { + { ov::PartialShape{2, 16}, ov::PartialShape{1, 16} }, + { ov::PartialShape{5, 18}, ov::PartialShape{5, 1} }, + { ov::PartialShape{3, 1}, ov::PartialShape{3, 21} } }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertInput, ConvertInput, @@ -94,10 +94,10 @@ const std::vector, std::vector> inputShapes_ConvertPartialInputsAndResults = { - { ov::Shape{2, 16}, ov::Shape{1, 16}, ov::Shape{1, 1} }, 
- { ov::Shape{5, 18}, ov::Shape{5, 1}, ov::Shape{1, 18} }, - { ov::Shape{3, 1}, ov::Shape{3, 21}, ov::Shape{3, 1} } +const std::vector> inputShapes_ConvertPartialInputsAndResults = { + { ov::PartialShape{2, 16}, ov::PartialShape{1, 16}, ov::PartialShape{1, 1} }, + { ov::PartialShape{5, 18}, ov::PartialShape{5, 1}, ov::PartialShape{1, 18} }, + { ov::PartialShape{3, 1}, ov::PartialShape{3, 21}, ov::PartialShape{3, 1} } }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertPartialInputsAndResults, ConvertPartialInputsAndResults, @@ -117,7 +117,7 @@ const std::vector, std::vector{ov::Shape{5, 5, 5, 5}}), + ::testing::Values(std::vector{{5, 5, 5, 5}}), ::testing::ValuesIn(types_ConvertMany), ::testing::Values(1), ::testing::Values(1), @@ -126,7 +126,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnInputs, ConvertManyOnInputs INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnOutputs, ConvertManyOnOutputs, ::testing::Combine( - ::testing::Values(std::vector{ov::Shape{5, 5, 5, 5}}), + ::testing::Values(std::vector{{5, 5, 5, 5}}), ::testing::ValuesIn(types_ConvertMany), ::testing::Values(1), ::testing::Values(1), @@ -140,7 +140,7 @@ const std::vector, std::vector{ov::Shape{5, 5, 5, 5}}), + ::testing::Values(std::vector{{5, 5, 5, 5}}), ::testing::ValuesIn(types_ConvertManyIO), ::testing::Values(1), ::testing::Values(1), diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp new file mode 100644 index 00000000000..9ab22c79d2e --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp @@ -0,0 +1,70 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/matmul.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { +std::vector> input_shapes{ + {{2, 1, 3, 5}, {1, 3, 5, 3}}, + {{3, 
1, 32, 14}, {1, 2, 14, 32}}, + {{1, 2, 37, 23}, {2, 1, 23, 37}}, + {{1, 1, 37, 23}, {1, 2, 23, 33}}, + {{2, 1, 69, 43}, {1, 1, 43, 49}} +}; +std::vector precisions{element::f32}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, MatMul, + ::testing::Combine( + ::testing::ValuesIn(input_shapes), + ::testing::ValuesIn(precisions), + ::testing::Values(1), // MatMul; + ::testing::Values(1), // Tokenized MatMul + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + MatMul::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulBias, MatMulBias, + ::testing::Combine( + ::testing::Values(std::vector{{1, 2, 69, 43}, {2, 1, 43, 49}, {1, 1, 69, 49}}), + ::testing::ValuesIn(precisions), + ::testing::Values(1), // Subgraph; + ::testing::Values(1), // Tokenized MatMul+Bias + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + MatMul::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ExplicitTransposeMatMul, ExplicitTransposeMatMul, + ::testing::Combine( + ::testing::Values(std::vector{{1, 2, 69, 43}, {2, 49, 2, 43}}), + ::testing::ValuesIn(precisions), + ::testing::Values(1), // Subgraph; + ::testing::Values(1), // Tokenized MatMul+Bias + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ExplicitTransposeMatMul::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeMatMulBias, ExplicitTransposeMatMulBias, + ::testing::Combine( + ::testing::Values(std::vector{{1, 2, 69, 43}, {2, 49, 2, 43}, {1, 1, 69, 49}}), + ::testing::ValuesIn(precisions), + ::testing::Values(1), // Subgraph; + ::testing::Values(1), // Tokenized MatMul+Bias + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + MatMul::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeMulMatMulBias, ExplicitTransposeMulMatMulBias, + ::testing::Combine( + ::testing::Values(std::vector{{1, 2, 69, 43}, {2, 49, 2, 43}, {1, 2, 1, 1}, {1, 1, 69, 49}}), + ::testing::ValuesIn(precisions), + ::testing::Values(1), // Subgraph; + ::testing::Values(1), // Tokenized MatMul+Bias + 
::testing::Values(CommonTestUtils::DEVICE_CPU)), + MatMul::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp new file mode 100644 index 00000000000..11aeaebdcc2 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp @@ -0,0 +1,67 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { + +const std::vector> inputShapes = { + {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}, + {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 1, 1, 128}, {1, 128, 16, 64}}, + {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 16, 1, 1}, {1, 128, 16, 64}}, + {{2, 68, 6, 92}, {2, 68, 6, 92}, {1, 1, 68, 68}, {2, 68, 6, 92}}, + {{1, 58, 16, 34}, {1, 58, 16, 34}, {1, 1, 1, 58}, {1, 58, 16, 34}}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA, MHA, + ::testing::Combine( + ::testing::ValuesIn(inputShapes), + ::testing::ValuesIn({false, true}), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + MHA::getTestCaseName); + +const std::vector> inputShapeSelect = { + // without broadcast + {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 12, 128, 128}, {1, 12, 128, 128}, {1, 128, 12, 64}}, + {{1, 94, 12, 54}, {1, 94, 12, 54}, {1, 12, 94, 94}, {1, 12, 94, 94}, {1, 12, 94, 94}, {1, 94, 12, 54}}, + // with broadcast + {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 12, 1, 1}, {1, 12, 1, 1}, {1, 128, 12, 64}}, + {{2, 52, 6, 102}, {2, 52, 6, 102}, {1, 6, 52, 52}, {1, 6, 1, 1}, {1, 6, 1, 1}, {2, 52, 6, 102}} +}; + 
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA, MHASelect, + ::testing::Combine( + ::testing::ValuesIn(inputShapeSelect), + ::testing::Values(false), // Need to support True for graph builder in tests + ::testing::Values(2), // Less + MHA + ::testing::Values(2), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + MHA::getTestCaseName); + +const std::vector> inputShapesWOTranspose = { + {{1, 12, 197, 64}, {1, 12, 64, 197}, {1, 12, 197, 64}} +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTransposeOnInputs, MHAWOTransposeOnInputs, + ::testing::Combine( + ::testing::ValuesIn(inputShapesWOTranspose), + ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + MHA::getTestCaseName); + + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/select.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/select.cpp new file mode 100644 index 00000000000..677d7678af0 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/select.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/select.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Select, Select, + ::testing::Combine( + ::testing::ValuesIn({ov::Shape{1, 5, 5, 35}, ov::Shape{1}}), + ::testing::ValuesIn({ov::Shape{1, 5, 5, 35}, ov::Shape{1}}), + ::testing::ValuesIn({ov::Shape{1, 5, 5, 35}, ov::Shape{1}}), + ::testing::ValuesIn({ov::element::f32, ov::element::i8}), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Select::getTestCaseName); + 
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BroadcastSelect, BroadcastSelect, + ::testing::Combine( + ::testing::ValuesIn({Shape{1, 8, 2, 1}, Shape{1, 1, 1, 1}}), + ::testing::ValuesIn({Shape{1, 8, 2, 10}, Shape{1, 8, 2, 1}}), + ::testing::ValuesIn({Shape{1, 8, 2, 10}, Shape{1, 1, 1, 1}}), + ::testing::ValuesIn({Shape{1, 8, 2, 1}, Shape{1, 8, 2, 10}}), + ::testing::ValuesIn({ov::element::f32, ov::element::i8}), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + BroadcastSelect::getTestCaseName); + + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp new file mode 100644 index 00000000000..8869ffdb6ee --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/softmax.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { + +const std::vector inputShape = { + ov::Shape{1, 16}, + ov::Shape{1, 32}, + ov::Shape{1, 1}, + ov::Shape{1, 9}, + ov::Shape{1, 17}, + ov::Shape{1, 19}, + ov::Shape{1, 49}, + ov::Shape{1, 50}, + ov::Shape{5, 16}, + ov::Shape{5, 32}, + ov::Shape{5, 1}, + ov::Shape{5, 9}, + ov::Shape{5, 17}, + ov::Shape{5, 19}, + ov::Shape{5, 49}, + ov::Shape{5, 50}, + ov::Shape{1, 3, 128, 128}, + ov::Shape{1, 3, 128, 129}, + ov::Shape{1, 3, 128, 130}, + ov::Shape{1, 3, 128, 1}, + ov::Shape{1, 3, 128, 9}, + ov::Shape{1, 3, 128, 16}, + ov::Shape{1, 3, 128, 17}, + ov::Shape{1, 3, 128, 20}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Softmax, Softmax, + ::testing::Combine( + ::testing::ValuesIn(inputShape), + ::testing::Values(-1), + 
::testing::Values(1), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Softmax::getTestCaseName); + +const std::vector> inputShapesPair = { + std::pair{ov::Shape{1, 5, 16, 35}, ov::Shape{1, 5, 16, 35}}, + std::pair{ov::Shape{1, 5, 16, 1}, ov::Shape{1, 5, 16, 35}}, + std::pair{ov::Shape{1, 5, 16, 35}, ov::Shape{1, 5, 1, 1}}, + std::pair{ov::Shape{1, 5, 16, 1}, ov::Shape{1, 5, 16, 1}}, + std::pair{ov::Shape{1, 5, 16, 35}, ov::Shape{1, 5, 1, 35}}, + std::pair{ov::Shape{1, 5, 1, 35}, ov::Shape{1, 5, 1, 35}}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_AddSoftmax, AddSoftmax, + ::testing::Combine( + ::testing::ValuesIn(inputShapesPair), + ::testing::Values(-1), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + AddSoftmax::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose.cpp new file mode 100644 index 00000000000..0179adb0a7a --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose.cpp @@ -0,0 +1,27 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/transpose.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { +std::vector input_shapes{{2, 3, 5, 13}, {2, 3, 2, 4}, {1, 7, 1, 4}}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Transpose, Transpose, + ::testing::Combine( + ::testing::ValuesIn(input_shapes), + ::testing::Values(std::vector {0, 2, 3, 1}), + ::testing::Values(1), // Transpose + ::testing::Values(1), // Tokenized Transpose + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Transpose::getTestCaseName); + +} // namespace +} // namespace snippets +} // 
namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp new file mode 100644 index 00000000000..8e3af45fd52 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp @@ -0,0 +1,63 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/transpose_matmul.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { +std::vector precisions{element::f32}; +namespace transpose_zero_input { +std::vector> transpose_input_shapes{ + {{1, 49, 2, 23}, {2, 2, 23, 39}} +}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, TransposeMatMul, + ::testing::Combine( + ::testing::ValuesIn(transpose_input_shapes), + ::testing::Values(0), // Transpose on 0th Matmul input + ::testing::ValuesIn(precisions), + ::testing::Values(1), // MatMul + ::testing::Values(1), // Tokenized MatMul + FusedTranspose + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + TransposeMatMul::getTestCaseName); +} // namespace transpose_zero_input + +namespace transpose_first_input { +std::vector> transpose_input_shapes{ + {{2, 1, 49, 13}, {1, 13, 3, 39}} +}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, TransposeMatMul, + ::testing::Combine( + ::testing::ValuesIn(transpose_input_shapes), + ::testing::Values(1), // Transpose on 1st Matmul input + ::testing::ValuesIn(precisions), + ::testing::Values(1), // MatMul + ::testing::Values(1), // Tokenized MatMul + FusedTranspose + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + TransposeMatMul::getTestCaseName); +} // namespace transpose_first_input + +namespace transpose_output { +std::vector> transpose_input_shapes{ + {{2, 1, 49, 13}, {1, 2, 13, 39}} +}; 
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, TransposeMatMul, + ::testing::Combine( + ::testing::ValuesIn(transpose_input_shapes), + ::testing::Values(2), // Transpose on Matmul output + ::testing::ValuesIn(precisions), + ::testing::Values(1), // MatMul + ::testing::Values(1), // Tokenized MatMul + FusedTranspose + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + TransposeMatMul::getTestCaseName); +} // namespace transpose_output + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_softmax.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_softmax.cpp new file mode 100644 index 00000000000..1158dff31c3 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_softmax.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/transpose_softmax.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { + +const std::vector inputShape = { + ov::Shape{1, 128, 3, 16}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeSoftmax, TransposeSoftmax, + ::testing::Combine( + ::testing::Values(inputShape), + ::testing::Values(std::vector{0, 2, 3, 1}), + ::testing::Values(-1), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + TransposeSoftmax::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeSoftmaxEltwise, TransposeSoftmaxEltwise, + ::testing::Combine( + ::testing::Values(inputShape), + ::testing::Values(std::vector{0, 2, 3, 1}), + ::testing::Values(-1), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + TransposeSoftmax::getTestCaseName); + +} // namespace +} // namespace snippets 
+} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp index 64042a3b01a..3b4db9cc4d3 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp @@ -10,7 +10,7 @@ namespace test { namespace snippets { namespace { -const std::vector> input_shapes = { +const std::vector> input_shapes = { { {5, 5, 256, 1}, {5, 5, 256, 1} }, { {5, 5, 16, 35}, {5, 5, 16, 35} }, { {5, 5, 256, 1}, {5, 5, 256, 35} }, @@ -26,7 +26,6 @@ const std::vector> input_shapes = { { {5, 5, 35, 17}, {5, 5, 35, 17} }, { {5, 5, 35, 17}, {5, 5, 1, 17} }, - { {5, 5, 35, 18}, {5, 5, 35, 18} }, { {5, 5, 35, 18}, {5, 5, 1, 18} }, }; diff --git a/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/mha.cpp b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/mha.cpp index 4f3578b9106..4222cb9b975 100644 --- a/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/mha.cpp +++ b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/mha.cpp @@ -25,6 +25,7 @@ typedef std::tuple< std::vector, // Input precisions std::vector, // MatMul input #0 precisions size_t, // pattern type # + std::string, // Expected node std::string // Device name > MHATuple; @@ -155,8 +156,9 @@ public: std::vector inputPrecisions; std::vector matMulIn0Precisions; size_t patternType; + std::string expectedNode; std::string targetName; - std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, targetName) = obj.param; + std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, expectedNode, targetName) = obj.param; std::ostringstream results; results << "IS=("; @@ -173,6 +175,7 @@ public: results << "InPRC" << 
std::to_string(i) << "=" << inputPrecisions[i] << "_"; } results << "patternType=" << patternType; + results << "expect=" << expectedNode; results << "targetDevice=" << targetName; return results.str(); @@ -195,7 +198,8 @@ protected: std::vector inputPrecisions; std::vector matMulIn0Precisions; size_t patternType; - std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, targetDevice) = this->GetParam(); + std::string expectedNode; + std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, expectedNode, targetDevice) = this->GetParam(); init_input_shapes(inputShapes); @@ -223,7 +227,8 @@ TEST_P(MHATest, CompareWithRefs) { std::vector inputPrecisions; std::vector matMulIn0Precisions; size_t patternType; - std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, targetDevice) = this->GetParam(); + std::string expectedNode; + std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, expectedNode, targetDevice) = this->GetParam(); if (inputPrecisions[0] == ElementType::bf16 && !InferenceEngine::with_cpu_x86_bfloat16()) GTEST_SKIP(); @@ -232,7 +237,7 @@ TEST_P(MHATest, CompareWithRefs) { GTEST_SKIP(); run(); - CheckNumberOfNodesWithType(compiledModel, "MHA", 1); + CheckNumberOfNodesWithType(compiledModel, expectedNode, 1); } namespace { @@ -247,11 +252,6 @@ std::vector> inputShapes = { {{1, 204, 13, 212}, {1, 204, 13, 212}, {1, 1, 1, 204}, {1, 204, 13, 212}}, }; -std::vector> inputPrecisions = { - { ElementType::f32, ElementType::f32, ElementType::f32, ElementType::f32 }, - { ElementType::bf16, ElementType::bf16, ElementType::bf16, ElementType::bf16 }, -}; - std::vector> matMulIn0Precisions = { {}, }; @@ -260,15 +260,26 @@ std::vector patternTypes = { 0, 1 }; -INSTANTIATE_TEST_SUITE_P(smoke_MHA, MHATest, +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA, MHATest, ::testing::Combine( ::testing::ValuesIn(static_shapes_to_test_representation(inputShapes)), - ::testing::ValuesIn(inputPrecisions), + 
::testing::Values(std::vector{ ElementType::f32, ElementType::f32, ElementType::f32, ElementType::f32 }), ::testing::ValuesIn(matMulIn0Precisions), ::testing::ValuesIn(patternTypes), + ::testing::Values("Subgraph"), ::testing::Values(CommonTestUtils::DEVICE_CPU)), MHATest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_MHA, MHATest, + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inputShapes)), + ::testing::Values(std::vector{ ElementType::bf16, ElementType::bf16, ElementType::bf16, ElementType::bf16 }), + ::testing::ValuesIn(matMulIn0Precisions), + ::testing::ValuesIn(patternTypes), + ::testing::Values("MHA"), // Snippets don't support BF16 MHA pattern yet + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + MHATest::getTestCaseName); + } // namespace static std::shared_ptr initMHAQuantSubgraph0(std::vector& inputDynamicShapes, std::vector& inputPrecisions, @@ -425,7 +436,8 @@ public: std::vector matMulIn0Precisions; size_t patternType; std::string targetName; - std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, targetName) = obj.param; + std::string expectedNode; + std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, expectedNode, targetName) = obj.param; std::ostringstream results; results << "IS=("; @@ -445,6 +457,7 @@ public: results << "MatMulIn0PRC" << std::to_string(i) << "=" << matMulIn0Precisions[i] << "_"; } results << "patternType=" << patternType; + results << "expect=" << expectedNode; results << "targetDevice=" << targetName; return results.str(); @@ -474,7 +487,8 @@ protected: std::vector inputPrecisions; std::vector matMulIn0Precisions; size_t patternType; - std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, targetDevice) = this->GetParam(); + std::string expectedNode; + std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, expectedNode, targetDevice) = this->GetParam(); init_input_shapes(inputShapes); @@ -493,7 +507,8 @@ 
TEST_P(MHAQuantTest, CompareWithRefs) { std::vector inputPrecisions; std::vector matMulIn0Precisions; size_t patternType; - std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, targetDevice) = this->GetParam(); + std::string expectedNode; + std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, expectedNode, targetDevice) = this->GetParam(); if (inputPrecisions[0] == ElementType::bf16 && !InferenceEngine::with_cpu_x86_bfloat16()) GTEST_SKIP(); @@ -502,7 +517,7 @@ TEST_P(MHAQuantTest, CompareWithRefs) { GTEST_SKIP(); run(); - CheckNumberOfNodesWithType(compiledModel, "MHA", 1); + CheckNumberOfNodesWithType(compiledModel, expectedNode, 1); } namespace { @@ -538,6 +553,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MHAQuant, MHAQuantTest, ::testing::ValuesIn(inputPrecisionsQuant), ::testing::ValuesIn(matMulIn0PrecisionsQuant), ::testing::ValuesIn(patternTypesQuant), + ::testing::Values("MHA"), // Snippets don't support Quantized MHA pattern yet ::testing::Values(CommonTestUtils::DEVICE_CPU)), MHAQuantTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/unit/ngraph_transformations/snipptes_mark_skipped.cpp b/src/plugins/intel_cpu/tests/unit/ngraph_transformations/snipptes_mark_skipped.cpp index aee3aff68bf..f3eaa9a38d6 100644 --- a/src/plugins/intel_cpu/tests/unit/ngraph_transformations/snipptes_mark_skipped.cpp +++ b/src/plugins/intel_cpu/tests/unit/ngraph_transformations/snipptes_mark_skipped.cpp @@ -6,7 +6,7 @@ #include #include #include -#include "snippets/pass/collapse_subgraph.hpp" +#include "snippets/pass/tokenization.hpp" namespace ov { namespace test { @@ -19,11 +19,17 @@ public: manager.register_pass(); manager.register_pass(); manager.register_pass(); + // + // todo: This is a temporary work-around. 
remove when MatMul tokenization is supported through general pipeline + manager.get_pass_config()->set_callback( + [](const std::shared_ptr& n) -> bool { + return ov::is_type(n); + }); } }; TEST_F(SnippetsMarkSkippedTests, smoke_Snippets_SkipAfterInputsMatMulEltwise) { - const auto &f = MatMulEltwiseBranchesFunction(std::vector {{1, 3, 4, 4}, {1, 3, 4, 4}}); + const auto &f = MatMulEltwiseBranchesFunction(std::vector {{1, 3, 4, 4}, {1, 3, 4, 4}}); function = f.getOriginal(); // Fully tokenizable, since inputs are followed by MatMul function_ref = f.getReference(); @@ -34,7 +40,7 @@ TEST_F(SnippetsMarkSkippedTests, smoke_Snippets_SkipConvFused_ConvMulActivation) std::vector> eltwiseOps {std::make_shared(), std::make_shared(), std::make_shared()}; - std::vector inputShapes {{1, 2, 16, 16}, {1, 2, 1, 16}}; + std::vector inputShapes {{1, 2, 16, 16}, {1, 2, 1, 16}}; const auto &f = ConvMulActivationFunction(inputShapes, eltwiseOps); function = f.getOriginal(); // Fully tokenizable, since Mul with 2 inputs isn't fused into Convolution @@ -46,7 +52,7 @@ TEST_F(SnippetsMarkSkippedTests, smoke_SkipConvFused_ConvSumActivation) { std::vector> eltwiseOps {std::make_shared(), std::make_shared(), std::make_shared()}; - std::vector inputShapes {{1, 2, 16, 16}, {1, 2, 1, 16}}; + std::vector inputShapes {{1, 2, 16, 16}, {1, 2, 1, 16}}; const auto &f = ConvMulActivationFunction(inputShapes, eltwiseOps); function = f.getOriginal(); // Not tokenizable, since Add + Eltwises can be fused into Convolution diff --git a/src/plugins/intel_cpu/tests/unit/snippets/fake_quantize_tokenization_test.cpp b/src/plugins/intel_cpu/tests/unit/snippets/fake_quantize_tokenization_test.cpp index e6f83cad753..518d2dfb1cc 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets/fake_quantize_tokenization_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets/fake_quantize_tokenization_test.cpp @@ -6,7 +6,7 @@ #include "common_test_utils/ngraph_test_utils.hpp" #include "snippets/pass/fq_decomposition.hpp" 
-#include "snippets/pass/collapse_subgraph.hpp" +#include "snippets/pass/tokenization.hpp" #include "fake_quantize_function.hpp" #include "snippets/op/subgraph.hpp" #include "ngraph_transformations/snippets_mark_skipped.hpp" diff --git a/src/tests/functional/plugin/shared/include/snippets/add.hpp b/src/tests/functional/plugin/shared/include/snippets/add.hpp index 7499d8ade45..1895d204df6 100644 --- a/src/tests/functional/plugin/shared/include/snippets/add.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/add.hpp @@ -19,6 +19,14 @@ typedef std::tuple< std::string // Target Device > AddParams; +typedef std::tuple< + std::vector, // Input 0, Input 1 Shape + ov::element::Type, // Element type + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> AddParamsPair; + typedef std::tuple< ov::Shape, // Input 0 Shape ov::element::Type, // Element type @@ -49,6 +57,15 @@ protected: void SetUp() override; }; +// repack AddPair input shapes into shape vector to cover some cases easier +class AddPair : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; +}; + } // namespace snippets } // namespace test } // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/convert.hpp b/src/tests/functional/plugin/shared/include/snippets/convert.hpp index bd4d7641711..fe534480fc4 100644 --- a/src/tests/functional/plugin/shared/include/snippets/convert.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/convert.hpp @@ -11,7 +11,7 @@ namespace test { namespace snippets { typedef std::tuple< - std::vector, // InputShapes + std::vector, // InputShapes std::pair, std::vector>, // Input and Output data types for Converts size_t, // Expected num nodes size_t, // Expected num subgraphs diff --git 
a/src/tests/functional/plugin/shared/include/snippets/matmul.hpp b/src/tests/functional/plugin/shared/include/snippets/matmul.hpp new file mode 100644 index 00000000000..bfa2a82921f --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/matmul.hpp @@ -0,0 +1,70 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + std::vector, // Input Shapes + ov::element::Type, // Element type + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> MatMulParams; + +typedef std::tuple< + std::vector, // Input Shapes + size_t , // Transpose position + ov::element::Type, // Element type + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> TransposeMatMulParams; + +class MatMul : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +class MatMulBias : public MatMul { +protected: + void SetUp() override; +}; + +class ExplicitTransposeMatMul : public MatMul { +protected: + void SetUp() override; +}; + +class ExplicitTransposeMatMulBias : public MatMul { +protected: + void SetUp() override; +}; + +class ExplicitTransposeMulMatMulBias : public MatMul { +protected: + void SetUp() override; +}; + +class TransposeMatMul : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/mha.hpp 
b/src/tests/functional/plugin/shared/include/snippets/mha.hpp new file mode 100644 index 00000000000..9f95dcc30ac --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/mha.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + std::vector, // Input shapes + bool, // With Multiply + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> MHAParams; + + +class MHA : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; + + void generate_inputs(const std::vector& targetInputStaticShapes) override; +}; + +class MHASelect : public MHA { +protected: + void SetUp() override; + + void generate_inputs(const std::vector& targetInputStaticShapes) override; +}; + +class MHAWOTransposeOnInputs : public MHA { +protected: + void SetUp() override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/include/snippets/select.hpp b/src/tests/functional/plugin/shared/include/snippets/select.hpp new file mode 100644 index 00000000000..e8e15ab97e4 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/select.hpp @@ -0,0 +1,59 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + ov::Shape, // Input 0 Shape + ov::Shape, // Input 1 Shape + ov::Shape, // Input 2 Shape + ov::element::Type, // Element type + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> 
SelectParams; + +typedef std::tuple< + ov::Shape, // Input 0 Shape + ov::Shape, // Input 1 Shape + ov::Shape, // Input 2 Shape + ov::Shape, // Input 3 Shape + ov::element::Type, // Element type + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> BroadcastSelectParams; + +class Select : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; + + void generate_inputs(const std::vector& targetInputStaticShapes) override; +}; + +class BroadcastSelect : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; + + void generate_inputs(const std::vector& targetInputStaticShapes) override; +}; + + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/include/snippets/softmax.hpp b/src/tests/functional/plugin/shared/include/snippets/softmax.hpp new file mode 100644 index 00000000000..ca3f77e4319 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/softmax.hpp @@ -0,0 +1,49 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + ov::Shape, // Input 0 Shape + int, // Axis + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> SoftmaxParams; + +typedef std::tuple< + std::pair, // Input Shapes + int, // Axis + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> AddSoftmaxParams; + +class Softmax : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: 
+ static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +class AddSoftmax : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/three_inputs_eltwise.hpp b/src/tests/functional/plugin/shared/include/snippets/three_inputs_eltwise.hpp index 2bb61b3b2b7..ce1fd7a8b5b 100644 --- a/src/tests/functional/plugin/shared/include/snippets/three_inputs_eltwise.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/three_inputs_eltwise.hpp @@ -19,6 +19,15 @@ typedef std::tuple< std::string // Target Device > ThreeInputsEltwiseParams; +typedef std::tuple< + InputShape, // Input 0 Shape + InputShape, // Input 1 Shape + InputShape, // Input 2 Shape + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device + > ThreeInputsEltwiseDynamicParams; + class ThreeInputsEltwise : public testing::WithParamInterface, virtual public ov::test::SnippetsTestsCommon { public: @@ -28,7 +37,6 @@ protected: void SetUp() override; }; - } // namespace snippets } // namespace test } // namespace ov diff --git a/src/tests/functional/plugin/shared/include/snippets/transpose.hpp b/src/tests/functional/plugin/shared/include/snippets/transpose.hpp new file mode 100644 index 00000000000..e1491ebe8b1 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/transpose.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + ov::PartialShape, // Input 0 Shape + std::vector, // Transpose 
order + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> TransposeParams; + +class Transpose : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/transpose_matmul.hpp b/src/tests/functional/plugin/shared/include/snippets/transpose_matmul.hpp new file mode 100644 index 00000000000..f949e9df9d5 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/transpose_matmul.hpp @@ -0,0 +1,33 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + std::vector, // Input Shapes + size_t , // Transpose position + ov::element::Type, // Element type + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> TransposeMatMulParams; + +class TransposeMatMul : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/transpose_softmax.hpp b/src/tests/functional/plugin/shared/include/snippets/transpose_softmax.hpp new file mode 100644 index 00000000000..952b7528a00 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/transpose_softmax.hpp @@ -0,0 +1,40 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + 
+#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + std::vector, // Input shapes + std::vector, // Transpose Order + int64_t, // Softmax Axis + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> TransposeSoftmaxParams; + + +class TransposeSoftmax : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +class TransposeSoftmaxEltwise : public TransposeSoftmax { +protected: + void SetUp() override; +}; + + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp b/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp index 0a209de2fe9..4284ceacfa4 100644 --- a/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp @@ -11,7 +11,7 @@ namespace test { namespace snippets { typedef std::tuple< - std::vector, // Input Shape All shapes + std::vector, // Input Shape All shapes size_t, // Expected num nodes size_t, // Expected num subgraphs std::string // Target Device diff --git a/src/tests/functional/plugin/shared/src/snippets/add.cpp b/src/tests/functional/plugin/shared/src/snippets/add.cpp index beb85401f52..bb6a5fbee60 100644 --- a/src/tests/functional/plugin/shared/src/snippets/add.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/add.cpp @@ -5,6 +5,8 @@ #include "common_test_utils/common_utils.hpp" #include "snippets/add.hpp" #include "subgraph_simple.hpp" +#include "ngraph_functions/builders.hpp" +#include "functional_test_utils/skip_tests_config.hpp" namespace ov { namespace test { @@ -76,6 +78,38 @@ void 
AddRollConst::SetUp() { setInferenceType(type); } +std::string AddPair::getTestCaseName(testing::TestParamInfo obj) { + std::vector input_shapes; + ov::element::Type type; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(input_shapes, type, num_nodes, num_subgraphs, targetDevice) = obj.param; + if (input_shapes.size() != 2) + IE_THROW() << "Invalid input shapes vector size"; + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(input_shapes[0]) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(input_shapes[1]) << "_"; + result << "T=" << type << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void AddPair::SetUp() { + std::vector input_shapes; + ov::element::Type type; + std::tie(input_shapes, type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + std::vector is; + for (const auto& s : input_shapes) { + is.emplace_back(InputShape {{}, {s, }}); + } + init_input_shapes(is); + auto f = ov::test::snippets::AddFunction({input_shapes[0], input_shapes[1]}); + function = f.getOriginal(); + setInferenceType(type); +} + TEST_P(Add, CompareWithRefImpl) { run(); validateNumSubgraphs(); @@ -91,6 +125,10 @@ TEST_P(AddRollConst, CompareWithRefImpl) { validateNumSubgraphs(); } +TEST_P(AddPair, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} } // namespace snippets } // namespace test diff --git a/src/tests/functional/plugin/shared/src/snippets/convert.cpp b/src/tests/functional/plugin/shared/src/snippets/convert.cpp index b4c5c840cb6..60419d28b2f 100644 --- a/src/tests/functional/plugin/shared/src/snippets/convert.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/convert.cpp @@ -12,7 +12,7 @@ namespace test { namespace snippets { std::string Convert::getTestCaseName(testing::TestParamInfo obj) { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> 
types; std::string targetDevice; size_t num_nodes, num_subgraphs; @@ -21,7 +21,7 @@ std::string Convert::getTestCaseName(testing::TestParamInfo inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); - + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertFunction(inputShape, types.first[0], types.second[0]); function = f.getOriginal(); output_type = types.second.front(); @@ -85,11 +84,10 @@ void Convert::generate_inputs(const std::vector& targetInputStaticSha } void ConvertInput::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); - + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertInputFunction(inputShape, types.first[0], types.second[0]); function = f.getOriginal(); } @@ -125,10 +123,10 @@ parameters ConvertInput::generate_params_random() const { } void ConvertOutput::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertOutputFunction(inputShape, types.first[0], types.second[0]); function = f.getOriginal(); @@ -136,10 +134,10 @@ void ConvertOutput::SetUp() { } void ConvertStub::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, 
targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertStubFunction(inputShape, types.first[0], types.second[0]); function = f.getOriginal(); @@ -147,40 +145,40 @@ void ConvertStub::SetUp() { } void ConvertPartialInputsAndResults::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertPartialInputsAndResultsFunction(inputShape, types.first, types.second); function = f.getOriginal(); } void ConvertManyOnInputs::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertManyOnInputsFunction(inputShape, types.first); function = f.getOriginal(); } void ConvertManyOnOutputs::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertManyOnOutputsFunction(inputShape, types.first); function = f.getOriginal(); } void ConvertManyOnInputOutput::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, 
ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertManyOnInputOutputFunction(inputShape, types.first, types.second); function = f.getOriginal(); diff --git a/src/tests/functional/plugin/shared/src/snippets/matmul.cpp b/src/tests/functional/plugin/shared/src/snippets/matmul.cpp new file mode 100644 index 00000000000..36782e59ad7 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/matmul.cpp @@ -0,0 +1,168 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include "snippets/matmul.hpp" +#include "subgraph_matmul.hpp" +#include "functional_test_utils/skip_tests_config.hpp" +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string MatMul::getTestCaseName(testing::TestParamInfo obj) { + std::vector input_shapes; + ov::element::Type elem_type; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(input_shapes, elem_type, num_nodes, num_subgraphs, targetDevice) = obj.param; + std::ostringstream result; + for (size_t i = 0; i < input_shapes.size(); i++) + result << "IS[" << i <<"]=" << CommonTestUtils::partialShape2str({input_shapes[i]}) << "_"; + result << "T=" << elem_type << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void MatMul::SetUp() { + std::vector input_shapes; + ov::element::Type elem_type; + std::tie(input_shapes, elem_type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(input_shapes)); + + auto f = ov::test::snippets::MatMulFunction(input_shapes); + function = 
f.getOriginal(); + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void MatMulBias::SetUp() { + std::vector input_shapes; + ov::element::Type elem_type; + std::tie(input_shapes, elem_type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(input_shapes)); + + auto f = ov::test::snippets::MatMulBiasFunction(input_shapes); + function = f.getOriginal(); + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void ExplicitTransposeMatMul::SetUp() { + std::vector input_shapes; + ov::element::Type elem_type; + std::tie(input_shapes, elem_type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(input_shapes)); + + auto f = ov::test::snippets::TransposeMatMulFunction(input_shapes); + function = f.getOriginal(); + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void ExplicitTransposeMatMulBias::SetUp() { + std::vector input_shapes; + ov::element::Type elem_type; + std::tie(input_shapes, elem_type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(input_shapes)); + + auto f = ov::test::snippets::TransposeMatMulBiasFunction(input_shapes); + function = f.getOriginal(); + if 
(!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void ExplicitTransposeMulMatMulBias::SetUp() { + std::vector input_shapes; + ov::element::Type elem_type; + std::tie(input_shapes, elem_type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(input_shapes)); + + auto f = ov::test::snippets::TransposeMulMatMulBiasFunction(input_shapes); + function = f.getOriginal(); + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +std::string TransposeMatMul::getTestCaseName(testing::TestParamInfo obj) { + std::vector input_shapes; + size_t transpose_position; + ov::element::Type elem_type; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(input_shapes, transpose_position, elem_type, num_nodes, num_subgraphs, targetDevice) = obj.param; + if (input_shapes.size() != 2) + IE_THROW() << "Invalid input shapes vector size"; + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::partialShape2str({input_shapes[0]}) << "_"; + result << "IS[1]=" << CommonTestUtils::partialShape2str({input_shapes[1]}) << "_"; + result << "Pos=" << transpose_position << "_"; + result << "T=" << elem_type << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void TransposeMatMul::SetUp() { + std::vector input_shapes; + size_t transpose_position; + ov::element::Type elem_type; + std::tie(input_shapes, transpose_position, elem_type, ref_num_nodes, ref_num_subgraphs, targetDevice) = 
this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(input_shapes)); + + auto f = ov::test::snippets::Transpose0213MatMulFunction(input_shapes, transpose_position); + function = f.getOriginal(); + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +TEST_P(MatMul, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(MatMulBias, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(ExplicitTransposeMatMul, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(ExplicitTransposeMatMulBias, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(ExplicitTransposeMulMatMulBias, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(TransposeMatMul, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp b/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp index 1061a2a4f1b..e0b4490ecbd 100644 --- a/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp @@ -27,10 +27,10 @@ std::string MaxNumParamsEltwise::getTestCaseName(testing::TestParamInfoGetParam(); - std::vector expandedShapes(10, inputShape); + std::vector expandedShapes(10, inputShape); std::vector input_shapes; for (const auto& s : expandedShapes) { - input_shapes.emplace_back(InputShape {{}, {s, }}); + input_shapes.emplace_back(InputShape {{}, {s.get_shape(), }}); } init_input_shapes(input_shapes); diff --git a/src/tests/functional/plugin/shared/src/snippets/mha.cpp b/src/tests/functional/plugin/shared/src/snippets/mha.cpp new 
file mode 100644 index 00000000000..bb8d7e585a2 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/mha.cpp @@ -0,0 +1,125 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include "snippets/mha.hpp" +#include "subgraph_mha.hpp" +#include "functional_test_utils/skip_tests_config.hpp" +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" +#include + +namespace ov { +namespace test { +namespace snippets { + +std::string MHA::getTestCaseName(testing::TestParamInfo obj) { + std::vector inputShapes; + bool withMul; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, withMul, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + for (size_t i = 0; i < inputShapes.size(); ++i) + result << "IS[" << i << "]=" << CommonTestUtils::partialShape2str({inputShapes[i]}) << "_"; + result << "Mul=" << withMul << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void MHA::SetUp() { + std::vector inputShapes; + bool withMul; + std::tie(inputShapes, withMul, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(inputShapes)); + + auto f = ov::test::snippets::MHAFunction(inputDynamicShapes, withMul); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void MHA::generate_inputs(const std::vector& targetInputStaticShapes) { + inputs.clear(); + const auto& model_inputs = function->inputs(); + for (int i = 0; i < model_inputs.size(); ++i) { + const auto& model_input = model_inputs[i]; + 
ov::Tensor tensor; + tensor = ov::test::utils::create_and_fill_tensor_normal_distribution(model_input.get_element_type(), targetInputStaticShapes[i], 1.0f, 0.5f); + inputs.insert({model_input.get_node_shared_ptr(), tensor}); + } +} + +void MHASelect::SetUp() { + std::vector inputShapes; + bool withMul; + std::tie(inputShapes, withMul, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(inputShapes)); + + auto f = ov::test::snippets::MHASelectFunction(inputDynamicShapes); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void MHASelect::generate_inputs(const std::vector& targetInputStaticShapes) { + inputs.clear(); + auto model_inputs = function->inputs(); + for (auto& model_input : model_inputs) { + const auto node_input = model_input.get_node_shared_ptr(); + const auto name = node_input->get_friendly_name(); + ov::Tensor tensor; + int seed = 0; + if (name.find("less") != std::string::npos) { + tensor = ov::test::utils::create_and_fill_tensor(model_input.get_element_type(), model_input.get_shape(), 5 + seed, -2, 10, seed++); + } else { + tensor = ov::test::utils::create_and_fill_tensor_normal_distribution(model_input.get_element_type(), model_input.get_shape(), 1.0f, 0.5f); + } + inputs.insert({node_input, tensor}); + } +} + +void MHAWOTransposeOnInputs::SetUp() { + std::vector inputShapes; + bool withMul; + std::tie(inputShapes, withMul, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(inputShapes)); + + auto f = ov::test::snippets::MHAWOTransposeOnInputsFunction(inputDynamicShapes); + function = f.getOriginal(); + + if 
(!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + + +TEST_P(MHA, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(MHASelect, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(MHAWOTransposeOnInputs, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/select.cpp b/src/tests/functional/plugin/shared/src/snippets/select.cpp new file mode 100644 index 00000000000..a2814a57890 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/select.cpp @@ -0,0 +1,114 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" +#include "snippets/select.hpp" +#include "subgraph_simple.hpp" + +namespace ov { +namespace test { +namespace snippets { + +namespace { +void generate_data(std::map, ov::Tensor>& data_inputs, const std::vector>& model_inputs) { + data_inputs.clear(); + auto tensor_bool = ov::test::utils::create_and_fill_tensor(model_inputs[0].get_element_type(), model_inputs[0].get_shape(), 3, -1, 2); + auto tensor0 = ov::test::utils::create_and_fill_tensor(model_inputs[1].get_element_type(), model_inputs[1].get_shape(), 10, -10, 2); + auto tensor1 = ov::test::utils::create_and_fill_tensor(model_inputs[2].get_element_type(), model_inputs[2].get_shape(), 10, 0, 2); + data_inputs.insert({model_inputs[0].get_node_shared_ptr(), tensor_bool}); + data_inputs.insert({model_inputs[1].get_node_shared_ptr(), tensor0}); + data_inputs.insert({model_inputs[2].get_node_shared_ptr(), tensor1}); +} +} // namespace + +std::string 
Select::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes0, inputShapes1, inputShapes2; + ov::element::Type type; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes0, inputShapes1, inputShapes2, type, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_"; + result << "IS[2]=" << CommonTestUtils::vec2str(inputShapes2) << "_"; + result << "T=" << type << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void Select::SetUp() { + ov::Shape inputShape0, inputShape1, inputShape2; + ov::element::Type type; + std::tie(inputShape0, inputShape1, inputShape2, type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation({inputShape0, inputShape1, inputShape2})); + + auto f = ov::test::snippets::SelectFunction({inputShape0, inputShape1, inputShape2}); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void Select::generate_inputs(const std::vector& targetInputStaticShapes) { + generate_data(inputs, function->inputs()); +} + +std::string BroadcastSelect::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes0, inputShapes1, inputShapes2, broadcastShape; + ov::element::Type type; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes0, inputShapes1, inputShapes2, broadcastShape, type, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS[0]=" << 
CommonTestUtils::vec2str(inputShapes0) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_"; + result << "IS[2]=" << CommonTestUtils::vec2str(inputShapes2) << "_"; + result << "BS=" << CommonTestUtils::vec2str(broadcastShape) << "_"; + result << "T=" << type << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void BroadcastSelect::SetUp() { + ov::Shape inputShape0, inputShape1, inputShape2, broadcastShape; + ov::element::Type type; + std::tie(inputShape0, inputShape1, inputShape2, broadcastShape, type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation({inputShape0, inputShape1, inputShape2})); + + auto f = ov::test::snippets::BroadcastSelectFunction({inputShape0, inputShape1, inputShape2}, broadcastShape); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void BroadcastSelect::generate_inputs(const std::vector& targetInputStaticShapes) { + generate_data(inputs, function->inputs()); +} + +TEST_P(Select, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(BroadcastSelect, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/softmax.cpp b/src/tests/functional/plugin/shared/src/snippets/softmax.cpp new file mode 100644 index 00000000000..13b45283278 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/softmax.cpp @@ -0,0 +1,91 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" 
+#include "snippets/softmax.hpp" +#include "subgraph_softmax.hpp" +#include "ngraph_functions/builders.hpp" +#include "functional_test_utils/skip_tests_config.hpp" +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string Softmax::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes; + int axis; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, axis, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_"; + result << "Axis=" << axis << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void Softmax::SetUp() { + ov::Shape inputShape; + int axis; + std::tie(inputShape, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShape, }}}); + + auto f = ov::test::snippets::SoftmaxFunction({inputShape}, axis); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +std::string AddSoftmax::getTestCaseName(testing::TestParamInfo obj) { + std::pair inputShapes; + int axis; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, axis, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes.first) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes.second) << "_"; + result << "Axis=" << axis << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + 
return result.str(); +} + +void AddSoftmax::SetUp() { + std::pair inputShapes; + int axis; + std::tie(inputShapes, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShapes.first, }}, {{}, {inputShapes.second, }}}); + + auto f = ov::test::snippets::AddSoftmaxFunction({inputShapes.first, inputShapes.second}, axis); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +TEST_P(Softmax, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(AddSoftmax, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp b/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp index 0c601cc8ebe..ad8db673983 100644 --- a/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp @@ -5,6 +5,7 @@ #include "common_test_utils/common_utils.hpp" #include "snippets/three_inputs_eltwise.hpp" #include "subgraph_simple.hpp" +#include "functional_test_utils/skip_tests_config.hpp" namespace ov { namespace test { diff --git a/src/tests/functional/plugin/shared/src/snippets/transpose.cpp b/src/tests/functional/plugin/shared/src/snippets/transpose.cpp new file mode 100644 index 00000000000..c5886fe74a8 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/transpose.cpp @@ -0,0 +1,52 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include "snippets/transpose.hpp" +#include "subgraph_transpose.hpp" +#include 
"functional_test_utils/skip_tests_config.hpp" +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string Transpose::getTestCaseName(testing::TestParamInfo obj) { + ov::PartialShape inputShape; + std::vector order; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShape, order, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS=" << CommonTestUtils::partialShape2str({inputShape}) << "_"; + result << "Order=" << CommonTestUtils::vec2str(order) << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void Transpose::SetUp() { + ov::PartialShape inputShape; + std::vector order; + std::tie(inputShape, order, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{inputShape}, {inputShape.get_shape(), }}}); + + auto f = ov::test::snippets::TransposeFunction({inputShape}, order); + function = f.getOriginal(); + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +TEST_P(Transpose, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/transpose_matmul.cpp b/src/tests/functional/plugin/shared/src/snippets/transpose_matmul.cpp new file mode 100644 index 00000000000..68a2140339f --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/transpose_matmul.cpp @@ -0,0 +1,57 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include 
"snippets/transpose_matmul.hpp" +#include "subgraph_matmul.hpp" +#include "functional_test_utils/skip_tests_config.hpp" +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string TransposeMatMul::getTestCaseName(testing::TestParamInfo obj) { + std::vector input_shapes; + size_t transpose_position; + ov::element::Type elem_type; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(input_shapes, transpose_position, elem_type, num_nodes, num_subgraphs, targetDevice) = obj.param; + if (input_shapes.size() != 2) + IE_THROW() << "Invalid input shapes vector size"; + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::partialShape2str({input_shapes[0]}) << "_"; + result << "IS[1]=" << CommonTestUtils::partialShape2str({input_shapes[1]}) << "_"; + result << "Pos=" << transpose_position << "_"; + result << "T=" << elem_type << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void TransposeMatMul::SetUp() { + std::vector input_shapes; + size_t transpose_position; + ov::element::Type elem_type; + std::tie(input_shapes, transpose_position, elem_type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(input_shapes)); + + auto f = ov::test::snippets::Transpose0213MatMulFunction(input_shapes, transpose_position); + function = f.getOriginal(); + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +TEST_P(TransposeMatMul, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git 
a/src/tests/functional/plugin/shared/src/snippets/transpose_softmax.cpp b/src/tests/functional/plugin/shared/src/snippets/transpose_softmax.cpp new file mode 100644 index 00000000000..aecdd418f05 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/transpose_softmax.cpp @@ -0,0 +1,82 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include "snippets/transpose_softmax.hpp" +#include "subgraph_softmax.hpp" +#include "ngraph_functions/builders.hpp" +#include "functional_test_utils/skip_tests_config.hpp" +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string TransposeSoftmax::getTestCaseName(testing::TestParamInfo obj) { + std::vector inputShapes; + std::vector order; + int axis; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, order, axis, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + for (size_t i = 0; i < inputShapes.size(); ++i) + result << "IS[" << i << "]=" << CommonTestUtils::vec2str(inputShapes[i]) << "_"; + result << "TO=" << CommonTestUtils::vec2str(order) << "_"; + result << "Axis=" << axis << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void TransposeSoftmax::SetUp() { + std::vector inputShapes; + std::vector order; + int64_t axis; + std::tie(inputShapes, order, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation(inputShapes)); + + auto f = ov::test::snippets::TransposeSoftmaxFunction(inputDynamicShapes, order, axis); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + 
configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void TransposeSoftmaxEltwise::SetUp() { + std::vector inputShapes; + std::vector order; + int64_t axis; + std::tie(inputShapes, order, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation(inputShapes)); + + auto f = ov::test::snippets::TransposeSoftmaxEltwiseFunction(inputDynamicShapes, order, axis); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } + + abs_threshold = 1e-6; +} + +TEST_P(TransposeSoftmax, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(TransposeSoftmaxEltwise, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp b/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp index 205587e1a30..81b3c93079c 100644 --- a/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp @@ -11,14 +11,14 @@ namespace test { namespace snippets { std::string TwoInputsAndOutputs::getTestCaseName(testing::TestParamInfo obj) { - std::vector inputShapes; + std::vector inputShapes; std::string targetDevice; size_t num_nodes, num_subgraphs; std::tie(inputShapes, num_nodes, num_subgraphs, targetDevice) = obj.param; std::ostringstream result; for (auto i = 0; i < inputShapes.size(); i++) - result << "IS[" << i << "]=" << CommonTestUtils::vec2str(inputShapes[i]) << "_"; + result << "IS[" << i << "]=" << 
CommonTestUtils::vec2str(inputShapes[i].get_shape()) << "_"; result << "#N=" << num_nodes << "_"; result << "#S=" << num_subgraphs << "_"; result << "targetDevice=" << targetDevice; @@ -26,9 +26,9 @@ std::string TwoInputsAndOutputs::getTestCaseName(testing::TestParamInfo inputShape; + std::vector inputShape; std::tie(inputShape, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::TwoInputsAndOutputsFunction(inputShape); function = f.getOriginal(); } diff --git a/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp b/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp index f4ccb9480e3..88a5f861aff 100644 --- a/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp +++ b/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp @@ -43,6 +43,11 @@ ov::runtime::Tensor generate(const std::shared_ptr& node, } namespace Activation { +// todo: this is a bug fixed! Merge it separately. +// Default parameters InputGenerateData(10, 20, 32768, 1) lead to input generation according to 10 + x/32768, +// where x {0, 20}, so all generated values are in the range [10, 10 + 6.1e-4]. 
+// Thus all the interval more-or-less fall within the uncertainty validation interval +// Fix let the range be at least 20x of resolution ov::runtime::Tensor generate(const ov::element::Type& elemType, const ov::Shape& targetShape, InputGenerateData inGenData = InputGenerateData(-1, 2*32768, 32768, 1)) { diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/snippets_helpers.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/snippets_helpers.hpp index 68986aea9ca..8bf96f6c99a 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/snippets_helpers.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/snippets_helpers.hpp @@ -18,7 +18,7 @@ class SnippetsFunctionBase { public: SnippetsFunctionBase() = delete; - explicit SnippetsFunctionBase(const std::vector& inputShapes, ov::element::Type_t precision = element::f32) + explicit SnippetsFunctionBase(const std::vector& inputShapes, ov::element::Type_t precision = element::f32) : input_shapes{inputShapes}, precision{precision} {}; std::shared_ptr getReference() const { @@ -53,7 +53,7 @@ protected: } const ov::element::Type_t precision; - const std::vector input_shapes; + const std::vector input_shapes; virtual void validate_function(const std::shared_ptr &f) const; }; @@ -67,7 +67,7 @@ protected: class SnippetsFunctionCustomizable : public SnippetsFunctionBase { public: SnippetsFunctionCustomizable() = delete; - SnippetsFunctionCustomizable(const std::vector& inputShapes, + SnippetsFunctionCustomizable(const std::vector& inputShapes, const std::vector>& customOps, const std::vector&& customOpsNumInputs); diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp index a7c6bd34e0f..526234409b3 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp +++ 
b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp @@ -22,7 +22,7 @@ namespace snippets { // Result class ConvertFunction : public SnippetsFunctionBase { public: - explicit ConvertFunction(const std::vector& inputShapes, + explicit ConvertFunction(const std::vector& inputShapes, const ov::element::Type inType = ov::element::f32, const ov::element::Type outType = ov::element::u8) : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { @@ -45,7 +45,7 @@ protected: // Result class ConvertInputFunction : public SnippetsFunctionBase { public: - explicit ConvertInputFunction(const std::vector& inputShapes, + explicit ConvertInputFunction(const std::vector& inputShapes, const ov::element::Type inType = ov::element::f32, const ov::element::Type outType = ov::element::u8) : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { @@ -67,7 +67,7 @@ protected: // Result class ConvertOutputFunction : public SnippetsFunctionBase { public: - explicit ConvertOutputFunction(const std::vector& inputShapes, + explicit ConvertOutputFunction(const std::vector& inputShapes, const ov::element::Type inType = ov::element::f32, const ov::element::Type outType = ov::element::i8) : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { @@ -91,7 +91,7 @@ protected: // Result Result class ConvertStubFunction : public SnippetsFunctionBase { public: - explicit ConvertStubFunction(const std::vector& inputShapes, + explicit ConvertStubFunction(const std::vector& inputShapes, const ov::element::Type inType = ov::element::f32, const ov::element::Type outType = ov::element::i8) : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { @@ -117,7 +117,7 @@ protected: // Result2 class ConvertPartialInputsAndResultsFunction : public SnippetsFunctionBase { public: - explicit ConvertPartialInputsAndResultsFunction(const std::vector& inputShapes, + explicit ConvertPartialInputsAndResultsFunction(const std::vector& 
inputShapes, const std::vector& inTypes = {ov::element::f32}, const std::vector& outTypes = {ov::element::f32}) : SnippetsFunctionBase(inputShapes), inTypes(inTypes), outTypes(outTypes) { @@ -142,7 +142,7 @@ protected: // Result class ConvertManyOnInputsFunction : public SnippetsFunctionBase { public: - explicit ConvertManyOnInputsFunction(const std::vector& inputShapes, const std::vector& types) + explicit ConvertManyOnInputsFunction(const std::vector& inputShapes, const std::vector& types) : SnippetsFunctionBase(inputShapes), types(types) { NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); NGRAPH_CHECK(types.size() > 1, "Got invalid number of element types"); @@ -165,7 +165,7 @@ protected: // Result Result class ConvertManyOnOutputsFunction : public SnippetsFunctionBase { public: - explicit ConvertManyOnOutputsFunction(const std::vector& inputShapes, const std::vector& types) + explicit ConvertManyOnOutputsFunction(const std::vector& inputShapes, const std::vector& types) : SnippetsFunctionBase(inputShapes), types(types) { NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); NGRAPH_CHECK(types.size() > 1, "Got invalid number of element types"); @@ -191,7 +191,7 @@ protected: // Result Result class ConvertManyOnInputOutputFunction : public SnippetsFunctionBase { public: - explicit ConvertManyOnInputOutputFunction(const std::vector& inputShapes, + explicit ConvertManyOnInputOutputFunction(const std::vector& inputShapes, const std::vector& inTypes, const std::vector& outTypes) : SnippetsFunctionBase(inputShapes), inTypes(inTypes), outTypes(outTypes) { diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_customizable.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_customizable.hpp index b663c22671f..3cbcfdac4a5 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_customizable.hpp +++ 
b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_customizable.hpp @@ -28,7 +28,7 @@ namespace snippets { // Result class ConvMulActivationFunction : public SnippetsFunctionCustomizable { public: - explicit ConvMulActivationFunction(const std::vector& inputShapes, const std::vector>& customOps) + explicit ConvMulActivationFunction(const std::vector& inputShapes, const std::vector>& customOps) : SnippetsFunctionCustomizable(inputShapes, customOps, {2, 1, 1}) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); NGRAPH_CHECK(input_shapes[0].size() == 4, "Only 4D input shapes are currently supported"); @@ -36,6 +36,7 @@ public: ov::op::util::is_unary_elementwise_arithmetic(customOps[1]) && ov::op::util::is_unary_elementwise_arithmetic(customOps[2]), "Got invalid custom ops: expected binary and two unary operations"); + NGRAPH_CHECK(input_shapes[0].is_static() && input_shapes[1].is_static(), "This test supports only static shapes"); } private: std::shared_ptr initOriginal() const override; diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp index fad086acf03..c583b5882ab 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp @@ -8,8 +8,10 @@ #include "snippets_helpers.hpp" #include "subgraph_simple.hpp" #include "subgraph_converts.hpp" +#include "subgraph_matmul.hpp" +#include "subgraph_softmax.hpp" -/* This file provides lowered representations (after the generate() was calles) for some simple functions. +/* This file provides lowered representations (after the generate() was called) for some simple functions. * This is required to test snippets lowering and optimization passes. 
All the functions are expected to be direct * descendants of SnippetsFunctionCustomizable (defined here) and one of the SnippetsFunctionBase derived classes * (declared in subgraph_simple.hpp). Note that the corresponding SnippetsFunctionBase child should use virtual inheritance @@ -22,7 +24,7 @@ namespace snippets { class AddFunctionLoweredBroadcast : public AddFunction { public: - explicit AddFunctionLoweredBroadcast(const std::vector& inputShapes, const std::vector& broadcastShapes) : + explicit AddFunctionLoweredBroadcast(const std::vector& inputShapes, const std::vector& broadcastShapes) : AddFunction(inputShapes), broadcast_shapes{broadcastShapes} { NGRAPH_CHECK(input_shapes.size() == broadcast_shapes.size(), "Broadcast shapes should have the same size as input_shapes"); @@ -37,10 +39,12 @@ private: class EltwiseThreeInputsLoweredFunction : public EltwiseThreeInputsFunction { public: - explicit EltwiseThreeInputsLoweredFunction(const std::vector& inputShapes, const std::vector& broadcastShapes) : + explicit EltwiseThreeInputsLoweredFunction(const std::vector& inputShapes, const std::vector& broadcastShapes) : EltwiseThreeInputsFunction(inputShapes), broadcast_shapes{broadcastShapes} { NGRAPH_CHECK(input_shapes.size() == broadcast_shapes.size(), "Broadcast shapes should have the same size as input_shapes"); + NGRAPH_CHECK(input_shapes[0].is_static() && input_shapes[1].is_static() && input_shapes[2].is_static(), + "Broadcast shapes should have the same size as input_shapes"); } protected: @@ -49,6 +53,41 @@ private: std::vector broadcast_shapes; }; +class Transpose0213MatMulLoweredFunction : public Transpose0213MatMulFunction { +public: + explicit Transpose0213MatMulLoweredFunction(const std::vector& inputShapes, size_t position = 0) : + Transpose0213MatMulFunction(inputShapes, position) { + } +protected: + std::shared_ptr initLowered() const override; +}; + +class SoftmaxLoweredFunction : public SoftmaxFunction { +public: + explicit 
SoftmaxLoweredFunction(const std::vector& inputShapes, int axis) : SoftmaxFunction(inputShapes, axis) {} + +protected: + std::shared_ptr initLowered() const override; +}; + +// With LoopFusion pass +class AddSoftmaxLoweredFunction : public AddSoftmaxFunction { +public: + explicit AddSoftmaxLoweredFunction(const std::vector& inputShapes, int axis) : AddSoftmaxFunction(inputShapes, axis) {} + +protected: + std::shared_ptr initLowered() const override; +}; + +class BroadcastAddLoweredFunction : public BroadcastAddFunction { +public: + explicit BroadcastAddLoweredFunction(const std::vector& inputShapes, const PartialShape& targetShape) : + BroadcastAddFunction(inputShapes, targetShape) {} + +protected: + std::shared_ptr initLowered() const override; +}; + } // namespace snippets } // namespace test } // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_matmul.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_matmul.hpp new file mode 100644 index 00000000000..ea533334e80 --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_matmul.hpp @@ -0,0 +1,96 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/ngraph.hpp" +#include "./snippets_helpers.hpp" + +/* This file contains definitions of relatively simple functions (models) that will be used + * to test snippets-specific behavior. All the functions are expected to be direct descendants of + * SnippetsFunctionBase, so their constructors take only one (inputShapes) argument. 
+ */ + +namespace ov { +namespace test { +namespace snippets { +/// Minimal graph to test MatMul support +/// Tokenized simply by starting subgraph, +// in1 in2 +// Matmul +// Result +// todo: remove once "no subgraph after input" limitation is relaxed +class MatMulFunction : public SnippetsFunctionBase { +public: + explicit MatMulFunction(const std::vector& inputShapes) + : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + std::shared_ptr initReference() const override; +}; + +// As same as MatMulFunction but with biases +class MatMulBiasFunction : public SnippetsFunctionBase { +public: + explicit MatMulBiasFunction(const std::vector& inputShapes) + : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +}; + +/// Minimal graph to test MatMul+Transpose combinations. Transpose location is specified via the position argument: +/// 0 - before the first MatMul input; 1 - before the second MatMul input; 2 - after the MatMul output. 
+/// Tokenized simply by starting subgraph, +// in1 in2 +// Transpose / +// Matmul +// Result +class Transpose0213MatMulFunction : public SnippetsFunctionBase { +public: + explicit Transpose0213MatMulFunction(const std::vector& inputShapes, size_t position = 0) + : SnippetsFunctionBase(inputShapes), transpose_position(position) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + NGRAPH_CHECK(input_shapes[0].rank().get_length() == 4 && input_shapes[1].rank().get_length() == 4, + "Only rank 4 input shapes are supported by this test"); + NGRAPH_CHECK(transpose_position >=0 && transpose_position <= 2, "Got invalid transpose position"); + } +protected: + std::shared_ptr initOriginal() const override; + size_t transpose_position; +}; + +class TransposeMatMulFunction : public SnippetsFunctionBase { +public: + explicit TransposeMatMulFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +}; + +class TransposeMatMulBiasFunction : public SnippetsFunctionBase { +public: + explicit TransposeMatMulBiasFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +}; + +class TransposeMulMatMulBiasFunction : public SnippetsFunctionBase { +public: + explicit TransposeMulMatMulBiasFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 4, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_mha.hpp 
b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_mha.hpp new file mode 100644 index 00000000000..309a32e9145 --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_mha.hpp @@ -0,0 +1,131 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/ngraph.hpp" +#include "snippets_helpers.hpp" + + +/* The file contains graphs with different MHA-patterns: + * Skeleton on MHA-pattern is: + * \ / + * MatMul0 + * | + * Eltwise/Select/Reshape/FakeQuantize + * | + * Softmax + * | + * Eltwise/Select/Reshape/FakeQuantize + * \ / + * MatMul1 + */ + +namespace ov { +namespace test { +namespace snippets { + +/* Graph: + * Transpose1[0,2,3,1] Constant + * \ / + * Transpose0[0,2,1,3] Multiply [with_mul = true] + * \ / + * MatMul0 + * \ / + * Add + * Reshape0 + * Softmax + * Reshape1 Transpose2[0,2,1,3] + * \ / + * MatMul1 + * Transpose3[0,2,1,3] + */ +class MHAFunction : public SnippetsFunctionBase { +public: + explicit MHAFunction(const std::vector& inputShapes, bool with_mul = true) + : SnippetsFunctionBase(inputShapes), with_mul(with_mul) { + NGRAPH_CHECK(input_shapes.size() == 4, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + std::shared_ptr initReference() const override; + + bool with_mul = true; +}; + +/* Graph: + * Transpose1[0,2,1,3] Constant + * \ / + * Transpose0[0,2,1,3] Multiply + * \ / + * MatMul0 [transposed_b = true] + * \ / + * Add + * Reshape0 + * Softmax + * Reshape1 Transpose2[0,2,1,3] + * \ / + * MatMul1 + * Transpose3[0,2,1,3] + */ +class MHAMatMul0TransposeFunction : public SnippetsFunctionBase { +public: + explicit MHAMatMul0TransposeFunction(const std::vector& inputShapes) + : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 4, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + std::shared_ptr 
initReference() const override; +}; + +/* Graph: + * Transpose1[0,2,3,1] Constant + * \ / + * Transpose0[0,2,1,3] Multiply + * \ \ / + * Broadcast Scalar MatMul0 + * \ | / + * Select + * Reshape0 + * Softmax + * Reshape1 Transpose2[0,2,1,3] + * \ / + * MatMul1 + * Transpose3[0,2,1,3] + */ +class MHASelectFunction : public SnippetsFunctionBase { +public: + explicit MHASelectFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 6, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +}; + +/* Graph: + * Constant + * \ / + * Multiply + * \ / + * MatMul0 + * | + * Softmax + * \ / + * MatMul1 + * | + * Transpose3[0,2,1,3] + */ +class MHAWOTransposeOnInputsFunction : public SnippetsFunctionBase { +public: + explicit MHAWOTransposeOnInputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp index 6ebc6acd7d7..b62719917ae 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp @@ -22,7 +22,7 @@ namespace snippets { // Result class AddFunction : public SnippetsFunctionBase { public: - explicit AddFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit AddFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -36,8 +36,9 @@ protected: // todo: remove Sinh once "no subgraph after input" 
limitation is relaxed class AddConstFunction : public SnippetsFunctionBase { public: - explicit AddConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit AddConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + NGRAPH_CHECK(input_shapes[0].is_static(), "This test supports only static shapes"); } protected: std::shared_ptr initOriginal() const override; @@ -53,8 +54,9 @@ protected: // The function is needed to check different input element types (model precision change) class AddRollConstFunction : public SnippetsFunctionBase { public: - explicit AddRollConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit AddRollConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + NGRAPH_CHECK(input_shapes[0].is_static(), "Only static shapes are supported"); } protected: std::shared_ptr initOriginal() const override; @@ -69,7 +71,7 @@ protected: // Result class EltwiseFunction : public SnippetsFunctionBase { public: - explicit EltwiseFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit EltwiseFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -84,7 +86,7 @@ protected: // Result class EltwiseThreeInputsFunction : public SnippetsFunctionBase { public: - explicit EltwiseThreeInputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit EltwiseThreeInputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); } protected: @@ -99,7 +101,7 @@ protected: // Result class EltwiseMaxNumParamsFunction : 
public SnippetsFunctionBase { public: - explicit EltwiseMaxNumParamsFunction(const std::vector& inputShapes) : + explicit EltwiseMaxNumParamsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 10, "Got invalid number of input shapes"); } @@ -115,7 +117,7 @@ protected: // Result class MatMulEltwiseBranchesFunction : public SnippetsFunctionBase { public: - explicit MatMulEltwiseBranchesFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit MatMulEltwiseBranchesFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); NGRAPH_CHECK(input_shapes[0].size() == 4 && input_shapes[1].size() == 4, "Only 4D input shapes are currently supported by this test"); @@ -123,6 +125,7 @@ public: // Note that single-element constant are not supported by the test, since they'll be converted // to snippets::op::Scalar. So a more comlex logics is required to produce reference function. 
NGRAPH_CHECK(input_shapes[0][1] == input_shapes[1][1], "Channel dimensions must be equal and != 1"); + NGRAPH_CHECK(input_shapes[0].is_static() && input_shapes[1].is_static(), "This test supports only static shapes"); } protected: @@ -138,7 +141,7 @@ protected: // Result class EltwiseLogLoopFunction : public SnippetsFunctionBase { public: - explicit EltwiseLogLoopFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit EltwiseLogLoopFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -155,7 +158,7 @@ protected: // Result class EltwiseTwoResultsFunction : public SnippetsFunctionBase { public: - explicit EltwiseTwoResultsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit EltwiseTwoResultsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -172,12 +175,58 @@ protected: // Result class TwoInputsAndOutputsFunction : public SnippetsFunctionBase { public: - explicit TwoInputsAndOutputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit TwoInputsAndOutputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: std::shared_ptr initOriginal() const override; }; +/// Verify Select +// in0 in1 in2 +// \ | / +// Select +// Result +class SelectFunction : public SnippetsFunctionBase { +public: + explicit SelectFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +}; +/// Verify Broadcast in passes +// in0 in1 +// Broadcast | +// \ / +// Add +// Result 
+class BroadcastAddFunction : public SnippetsFunctionBase { +public: + explicit BroadcastAddFunction(const std::vector& inputShapes, const PartialShape& targetShape) + : SnippetsFunctionBase(inputShapes), m_target_shape(targetShape) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + + PartialShape m_target_shape; +}; + +/// Verify Select + Broadcast +// in0 in1 in2 +// Broadcast | | +// \ | / +// Select +// Result +class BroadcastSelectFunction : public SelectFunction { +public: + explicit BroadcastSelectFunction(const std::vector& inputShapes, const PartialShape& targetShape) + : SelectFunction(inputShapes), m_target_shape(targetShape) {} +protected: + std::shared_ptr initOriginal() const override; + + PartialShape m_target_shape; +}; } // namespace snippets } // namespace test } // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp new file mode 100644 index 00000000000..90cec1a1a3c --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp @@ -0,0 +1,57 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/ngraph.hpp" +#include "./snippets_helpers.hpp" + +namespace ov { +namespace test { +namespace snippets { + +class SoftmaxFunction : public SnippetsFunctionBase { +public: + explicit SoftmaxFunction(const std::vector& inputShapes, int axis) : SnippetsFunctionBase(inputShapes), axis(axis) { + NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + int axis; +}; + +class AddSoftmaxFunction : public SnippetsFunctionBase { +public: + explicit AddSoftmaxFunction(const std::vector& inputShapes, int axis) : SnippetsFunctionBase(inputShapes), 
axis(axis) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + int axis; +}; + +class TransposeSoftmaxFunction : public SnippetsFunctionBase { +public: + explicit TransposeSoftmaxFunction(const std::vector& inputShapes, const std::vector& order, const int64_t axis) + : SnippetsFunctionBase(inputShapes), m_order(order), m_axis(axis) { + NGRAPH_CHECK(input_shapes.size() > 0, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + + std::vector m_order; + int64_t m_axis; +}; + +class TransposeSoftmaxEltwiseFunction : public TransposeSoftmaxFunction { +public: + explicit TransposeSoftmaxEltwiseFunction(const std::vector& inputShapes, const std::vector& order, const int64_t axis) + : TransposeSoftmaxFunction(inputShapes, order, axis) {} +protected: + std::shared_ptr initOriginal() const override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_transpose.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_transpose.hpp new file mode 100644 index 00000000000..b77ea54e257 --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_transpose.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/ngraph.hpp" +#include "snippets_helpers.hpp" + +/* This file contains definitions of relatively simple functions (models) that will be used + * to test snippets-specific behavior. All the functions are expected to be direct descendants of + * SnippetsFunctionBase, so their constructors take only one (inputShapes) argument. 
+ */ + +namespace ov { +namespace test { +namespace snippets { +/// Minimal graph to test Transpose support: Parameter->Sinh->Transpose->Result +/// Tokenized simply by starting subgraph, supported through TransposeDecomposition +// in1 Const(order) +// Transpose +// Result +class TransposeFunction : public SnippetsFunctionBase { +public: + explicit TransposeFunction(const std::vector& inputShapes, std::vector order) + : SnippetsFunctionBase(inputShapes), order(std::move(order)) { + NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + std::shared_ptr initReference() const override; + std::vector order; +}; +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/snippets_helpers.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/snippets_helpers.cpp index ff7cdc986a5..8cec4a4aca9 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/snippets_helpers.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/snippets_helpers.cpp @@ -15,11 +15,11 @@ void SnippetsFunctionBase::validate_function(const std::shared_ptr &f) co NGRAPH_CHECK(params.size() == input_shapes.size(), "Passed input shapes and produced function are inconsistent."); for (size_t i = 0; i < input_shapes.size(); i++) - NGRAPH_CHECK(std::equal(input_shapes[i].begin(), input_shapes[i].end(), params[i]->get_shape().begin()), + NGRAPH_CHECK(std::equal(input_shapes[i].begin(), input_shapes[i].end(), params[i]->get_partial_shape().begin()), "Passed input shapes and produced function are inconsistent."); } -SnippetsFunctionCustomizable::SnippetsFunctionCustomizable(const std::vector& inputShapes, +SnippetsFunctionCustomizable::SnippetsFunctionCustomizable(const std::vector& inputShapes, const std::vector>& customOps, const std::vector&& customOpsNumInputs) : SnippetsFunctionBase(inputShapes), 
custom_ops{customOps}, custom_ops_num_inputs{customOpsNumInputs} { diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_customizable.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_customizable.cpp index ccf1ce4081e..9975f5185c1 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_customizable.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_customizable.cpp @@ -13,7 +13,7 @@ namespace snippets { std::shared_ptr ConvMulActivationFunction::initOriginal() const { auto conv_param = std::make_shared(precision, input_shapes[0]); - const auto channels = input_shapes[0][1]; + const auto channels = static_cast(input_shapes[0][1].get_length()); ngraph::Shape strides(2, 1); std::vector pad_begin(2, 1), pad_end(2, 1); const Shape const_shape {channels, channels, 3, 3}; @@ -37,7 +37,7 @@ std::shared_ptr ConvMulActivationFunction::initReference() const { auto conv_param = std::make_shared(precision, input_shapes[0]); ngraph::Shape strides(2, 1); std::vector pad_begin(2, 1), pad_end(2, 1); - const auto channels = input_shapes[0][1]; + const auto channels = static_cast(input_shapes[0][1].get_length()); const Shape const_shape {channels, channels, 3, 3}; const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(const_shape), -10., 10.); auto weights = std::make_shared(precision, const_shape, const_values); diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp index 8fd664b1921..22b86982e9e 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp @@ -6,6 +6,7 @@ #include "common_test_utils/data_utils.hpp" #include #include "ngraph_functions/builders.hpp" +#include "snippets/pass/loop_helpers.hpp" namespace ov { namespace test { @@ -14,7 
+15,7 @@ namespace snippets { std::shared_ptr AddFunctionLoweredBroadcast::initLowered() const { auto data0 = std::make_shared(precision, input_shapes[0]); std::shared_ptr add_input0 = nullptr; - if (!broadcast_shapes[0].empty() && broadcast_shapes[0].back() != input_shapes[0].back()) { + if (!broadcast_shapes[0].empty() && broadcast_shapes[0].back() != input_shapes[0].rbegin()->get_length()) { add_input0 = std::make_shared(data0, broadcast_shapes[0]); } else { add_input0 = std::make_shared(data0); @@ -22,18 +23,38 @@ std::shared_ptr AddFunctionLoweredBroadcast::initLowered() const { auto data1 = std::make_shared(precision, input_shapes[1]); std::shared_ptr add_input1 = nullptr; - if (!broadcast_shapes[1].empty() && broadcast_shapes[1].back() != input_shapes[1].back()) { + if (!broadcast_shapes[1].empty() && broadcast_shapes[1].back() != input_shapes[1].rbegin()->get_length()) { add_input1 = std::make_shared(data1, broadcast_shapes[1]); } else { add_input1 = std::make_shared(data1); } auto add = std::make_shared(add_input0, add_input1); auto store = std::make_shared(add); - return std::make_shared(NodeVector{store}, ParameterVector{data0, data1}); + ParameterVector input_params {data0, data1}; + auto model = std::make_shared(NodeVector{store}, input_params); + + // Create dummy scheduler to pass graph comparison tests + // Note that if there is more than one results, they should be reverted + ResultVector results({model->get_results()[0]}); + const auto& inner_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + std::vector apply_increments(input_params.size() + results.size(), true); + insertLoopEnd(results, inner_loop_begin, 1, 1, apply_increments); + auto outer_WA = std::accumulate(input_shapes.begin(), input_shapes.end(), 0, + [](int64_t max_val, const PartialShape& ps) { + return std::max(ps[ps.size() - 2].get_length(), max_val); + }); + if (outer_WA > 1) { + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + 
insertLoopEnd(results, outer_loop_begin, 1, 1, apply_increments); + } + return model; } std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() const { // todo: implement conversion between std::vector and std::vector - auto input_params = ngraph::builder::makeParams(precision, {input_shapes[0], input_shapes[1], input_shapes[2]}); + auto input_params = ngraph::builder::makeParams(precision, + {input_shapes[0].get_shape(), + input_shapes[1].get_shape(), + input_shapes[2].get_shape()}); auto load_or_broadcastload = [&](size_t i) -> std::shared_ptr { // user specified that no broadcasting is required if (broadcast_shapes[i].empty()) { @@ -41,7 +62,7 @@ std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() cons // broadcasting is required: could be Load + BroadcastMove or BroiadcastLoad } else { // The last dim is processed by vector Tile, so BroadcastLoad is required if the last dim being broadcasted - if (input_shapes[i].back() == 1 && broadcast_shapes[i].back() != 1) { + if (input_shapes[i].rbegin()->get_length() == 1 && broadcast_shapes[i].back() != 1) { return std::make_shared(input_params[i], broadcast_shapes[i]); // Todo: Cover this logics with functional tests, Review FakeBroadcast Emitter // Broadcasting of other dims is handled by BroadcastMove. 
Strictly speaking, broadcasting is achieved via @@ -57,12 +78,6 @@ std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() cons const std::vector const_values = CommonTestUtils::generate_float_numbers(1, -10., 10.); auto sub_scalar = std::make_shared(precision, Shape{1}, const_values[0]); std::shared_ptr sub_load; -// Todo: Uncomment when invalid read in vector tile will be fixed -// if (input_shapes[2].back() == 1) -// sub_load = std::make_shared(input_params[2]); -// else -// sub_load = std::make_shared(input_params[2]); -// remove when the code above is enabled: sub_load = std::make_shared(input_params[2]); auto sub = std::make_shared(sub_load, sub_scalar); std::shared_ptr sub_out; @@ -72,7 +87,334 @@ std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() cons sub_out = std::make_shared(sub, broadcast_shapes[2]); auto mul = std::make_shared(add, sub_out); auto store = std::make_shared(mul); - return std::make_shared(NodeVector{store}, input_params); + auto model = std::make_shared(NodeVector{store}, input_params); + + // Create dummy scheduler to pass graph comparison tests + // Note that if there is more than one results, they should be reverted + ResultVector results({model->get_results()[0]}); + const auto& inner_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + std::vector apply_increments(input_params.size() + results.size(), true); + const auto& inner_loop_end = insertLoopEnd(results, inner_loop_begin, 1, 1, apply_increments); + auto outer_WA = std::accumulate(input_shapes.begin(), input_shapes.end(), 0, + [](int64_t max_val, const PartialShape& ps) { + return std::max(ps[ps.size() - 2].get_length(), max_val); + }); + if (outer_WA > 1) { + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + insertLoopEnd(results, outer_loop_begin, 1, 1, apply_increments); + } + return model; +} + +std::shared_ptr Transpose0213MatMulLoweredFunction::initLowered() const { + ParameterVector 
data{std::make_shared(precision, input_shapes[0]), + std::make_shared(precision, input_shapes[1])}; + std::vector layout{0, 2, 1, 3}; + // Note: validity of transpose_position values is checked in Transpose0213MatMulSinhFunction constructor + if (transpose_position <= 1) { + auto &rt_info = data[transpose_position]->get_rt_info(); + rt_info["Layout"] = layout; + } + auto matmul = std::make_shared(data[0], data[1]); + if (transpose_position == 2) { + auto &rt_info = matmul->get_rt_info(); + rt_info["Layout"] = layout; + matmul->validate_and_infer_types(); + } + return std::make_shared(NodeVector{matmul}, data); +} + +std::shared_ptr SoftmaxLoweredFunction::initLowered() const { + auto input_params = ngraph::builder::makeParams(precision, {input_shapes[0].get_shape()}); + + const auto data = input_params.front(); + + const auto master_shape = input_shapes[0].get_shape(); + const auto shape_rank = master_shape.size(); + const auto dimension = shape_rank - 1; + const auto work_amount = master_shape[dimension]; + const auto increment = 10; + const auto inner_dim = shape_rank - 1; + const auto inner_master_wa = static_cast(master_shape[inner_dim]); + const int outer_dim = shape_rank > 1 ? 
shape_rank - 2 : -1; + const auto has_outer_loop = outer_dim >= 0 && master_shape[outer_dim] > 1; + const bool is_scalar = work_amount == 1; + + /* ====== ReduceMax decomposition ====== */ + + const auto vector_buffer_max = std::make_shared(); + const auto loop_max_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{data, data}); + + // we don't insert Fill here after load_max to verify because in generate() call Fill op is inserted only on vector representation + const auto load_max = std::make_shared(loop_max_begin->output(0), increment); + const auto max = std::make_shared(load_max, vector_buffer_max); + + std::vector apply_increments_max(3, false); + std::vector finalization_offsets_max(3, 0); + apply_increments_max[0] = data->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_max[0] = data->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + const auto loop_max_end = std::make_shared(ngraph::OutputVector{loop_max_begin->output(1), loop_max_begin->output(2)}, + work_amount, increment, apply_increments_max, finalization_offsets_max); + + std::shared_ptr horizon_max = std::make_shared(max); + horizon_max->add_control_dependency(loop_max_end); + const auto prev_horizon_max = horizon_max; + if (!is_scalar) { + horizon_max = std::make_shared(horizon_max, horizon_max->get_input_partial_shape(0)); + } + + loop_max_begin->add_control_dependency(vector_buffer_max); + loop_max_end->add_control_dependency(max); + + /* =========================================== */ + + /* === Sub + Exp + ReduceSum decomposition === */ + + const auto vector_buffer_sum = std::make_shared(); + const auto loop_sum_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{loop_max_end->output(0)}); + + const auto load_sub = std::make_shared(loop_sum_begin->output(0), increment); + const auto sub = std::make_shared(load_sub, horizon_max); + // we don't insert Fill here after Exp to verify because in generate() call Fill op is inserted only on 
vector representation + const auto exp = std::make_shared(sub); + const auto sum = std::make_shared(exp, vector_buffer_sum); + const auto store_exp = std::make_shared(exp, increment); + + std::vector apply_increments_sum(2, false); + std::vector finalization_offsets_sum(2, 0); + apply_increments_sum[0] = load_sub->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + apply_increments_sum[1] = store_exp->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_sum[0] = has_outer_loop && load_sub->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_sum[1] = store_exp->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + const auto loop_sum_end = std::make_shared( + ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, work_amount, increment, + apply_increments_sum, finalization_offsets_sum); + loop_sum_end->add_control_dependency(sum); + + const auto horizon_sum = std::make_shared(sum); + horizon_sum->add_control_dependency(loop_sum_end); + + const auto buffer_exp = std::make_shared(loop_sum_end->output(0)); + + loop_sum_begin->add_control_dependency(vector_buffer_sum); + loop_sum_begin->add_control_dependency(horizon_max); + loop_sum_begin->add_control_dependency(prev_horizon_max); + + /* =========================================== */ + + /* ================== Div ==================== */ + + std::shared_ptr pow = std::make_shared(horizon_sum, -1); + const auto prev_pow = pow; + if (!is_scalar) { + pow = std::make_shared(pow, horizon_sum->get_input_partial_shape(0)); + } + + const auto loop_div_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{buffer_exp}); + + const auto load_div = std::make_shared(loop_div_begin->output(0), increment); + const auto mul = std::make_shared(load_div, pow); + const auto store_div = std::make_shared(mul, increment); + + std::vector apply_increments_div(2, false); + std::vector finalization_offsets_div(2, 0); + apply_increments_div[0] = load_div->get_shape()[inner_dim] != 
1 && inner_master_wa != 1; + apply_increments_div[1] = store_div->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_div[0] = has_outer_loop && load_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_div[1] = has_outer_loop && store_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + const auto loop_div_end = std::make_shared( + ngraph::OutputVector{store_div, loop_div_begin->output(1)}, work_amount, increment, + apply_increments_div, finalization_offsets_div); + loop_div_begin->add_control_dependency(horizon_sum); + loop_div_begin->add_control_dependency(pow); + loop_div_begin->add_control_dependency(prev_pow); + + /* =========================================== */ + + const auto result = std::make_shared(loop_div_end); + if (has_outer_loop) { + const auto need_increment = input_shapes[0].get_shape()[outer_dim] != 1 && input_shapes[0].get_shape()[inner_dim] == 1; + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + const auto outer_loop_end = insertLoopEnd(NodeVector{result}, outer_loop_begin, 1, 1, std::vector{need_increment, need_increment}); + vector_buffer_max->add_control_dependency(outer_loop_begin); + } + + return std::make_shared(ResultVector{result}, input_params); +} +std::shared_ptr AddSoftmaxLoweredFunction::initLowered() const { + auto input_params = ngraph::builder::makeParams(precision, {input_shapes[0].get_shape(), input_shapes[1].get_shape()}); + + auto master_pshape = input_shapes[0]; + ov::PartialShape::broadcast_merge_into(master_pshape, input_shapes[1], op::AutoBroadcastType::NUMPY); + const auto master_shape = master_pshape.get_shape(); + const auto shape_rank = master_shape.size(); + const auto dimension = shape_rank - 1; + const auto work_amount = master_shape[dimension]; + const auto increment = 10; + const auto inner_dim = shape_rank - 1; + const auto inner_master_wa = static_cast(master_shape[inner_dim]); + const int outer_dim = shape_rank > 1 
? shape_rank - 2 : -1; + const auto has_outer_loop = outer_dim >= 0 && master_shape[outer_dim] > 1; + const bool is_scalar = work_amount == 1; + + /* ================== Add + ReduceMax ==================== */ + + const auto vector_buffer_max = std::make_shared(); + const auto loop_max_begin = ngraph::snippets::op::insertLoopBegin(input_params); + + std::shared_ptr load0 = std::make_shared(loop_max_begin->output(0), increment); + if (!is_scalar && input_shapes[0].get_shape().back() == 1) { + auto new_shape = input_shapes[0].get_shape(); + new_shape[new_shape.size() - 1] = static_cast(inner_master_wa); + load0 = std::make_shared(loop_max_begin->output(0), new_shape); + } + std::shared_ptr load1 = std::make_shared(loop_max_begin->output(1), increment); + if (!is_scalar && input_shapes[1].get_shape().back() == 1) { + auto new_shape = input_shapes[1].get_shape(); + new_shape[new_shape.size() - 1] = static_cast(inner_master_wa); + load1 = std::make_shared(loop_max_begin->output(1), new_shape); + } + const auto add = std::make_shared(load0, load1); + const auto store = std::make_shared(add, increment); + + // we don't insert Fill here after load_max to verify because in generate() call Fill op is inserted only on vector representation + const auto max = std::make_shared(add, vector_buffer_max); + + std::vector apply_increments_max(3, false); + std::vector finalization_offsets_max(3, 0); + apply_increments_max[0] = input_shapes[0].get_shape()[inner_dim] != 1 && inner_master_wa != 1; + apply_increments_max[1] = input_shapes[1].get_shape()[inner_dim] != 1 && inner_master_wa != 1; + apply_increments_max[2] = master_shape[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_max[0] = input_shapes[0].get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_max[1] = input_shapes[1].get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_max[2] = master_shape[inner_dim] != 1 ? 
-inner_master_wa : 0; + const auto loop_max_end = std::make_shared(ngraph::OutputVector{store, loop_max_begin->output(2)}, + work_amount, increment, apply_increments_max, finalization_offsets_max); + + std::shared_ptr horizon_max = std::make_shared(max); + horizon_max->add_control_dependency(loop_max_end); + const auto prev_horizon_max = horizon_max; + if (!is_scalar) { + horizon_max = std::make_shared(horizon_max, horizon_max->get_input_partial_shape(0)); + } + + loop_max_begin->add_control_dependency(vector_buffer_max); + loop_max_end->add_control_dependency(max); + + /* =========================================== */ + + const auto buffer_add = std::make_shared(loop_max_end->output(0)); + + /* === Sub + Exp + ReduceSum decomposition === */ + + const auto vector_buffer_sum = std::make_shared(); + const auto loop_sum_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{buffer_add->output(0)}); + + const auto load_sub = std::make_shared(loop_sum_begin->output(0), increment); + const auto sub = std::make_shared(load_sub, horizon_max); + // we don't insert Fill here after exp to verify because in generate() call Fill op is inserted only on vector representation + const auto exp = std::make_shared(sub); + const auto sum = std::make_shared(exp, vector_buffer_sum); + const auto store_exp = std::make_shared(exp, increment); + + std::vector apply_increments_sum(2, false); + std::vector finalization_offsets_sum(2, 0); + apply_increments_sum[0] = load_sub->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + apply_increments_sum[1] = store_exp->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_sum[0] = has_outer_loop && load_sub->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_sum[1] = store_exp->get_shape()[inner_dim] != 1 ? 
-inner_master_wa : 0; + const auto loop_sum_end = std::make_shared( + ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, work_amount, increment, + apply_increments_sum, finalization_offsets_sum); + loop_sum_end->add_control_dependency(sum); + + const auto horizon_sum = std::make_shared(sum); + horizon_sum->add_control_dependency(loop_sum_end); + + const auto buffer_exp = std::make_shared(loop_sum_end->output(0)); + + loop_sum_begin->add_control_dependency(vector_buffer_sum); + loop_sum_begin->add_control_dependency(horizon_max); + loop_sum_begin->add_control_dependency(prev_horizon_max); + + /* =========================================== */ + + /* ================== Div ==================== */ + + std::shared_ptr pow = std::make_shared(horizon_sum, -1); + const auto prev_pow = pow; + if (!is_scalar) { + pow = std::make_shared(pow, horizon_sum->get_input_partial_shape(0)); + } + + const auto loop_div_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{buffer_exp}); + + const auto load_div = std::make_shared(loop_div_begin->output(0), increment); + const auto mul = std::make_shared(load_div, pow); + const auto store_div = std::make_shared(mul, increment); + + std::vector apply_increments_div(2, false); + std::vector finalization_offsets_div(2, 0); + apply_increments_div[0] = load_div->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + apply_increments_div[1] = store_div->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_div[0] = has_outer_loop && load_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_div[1] = has_outer_loop && store_div->get_shape()[inner_dim] != 1 ? 
-inner_master_wa : 0; + const auto loop_div_end = std::make_shared( + ngraph::OutputVector{store_div, loop_div_begin->output(1)}, work_amount, increment, + apply_increments_div, finalization_offsets_div); + loop_div_begin->add_control_dependency(horizon_sum); + loop_div_begin->add_control_dependency(pow); + loop_div_begin->add_control_dependency(prev_pow); + + /* =========================================== */ + + const auto result = std::make_shared(loop_div_end); + if (has_outer_loop) { + const auto need_increment0 = input_shapes[0].get_shape()[outer_dim] != 1 && input_shapes[0].get_shape()[inner_dim] == 1; + const auto need_increment1 = input_shapes[1].get_shape()[outer_dim] != 1 && input_shapes[1].get_shape()[inner_dim] == 1; + const auto need_increment2 = master_shape[outer_dim] != 1 && master_shape[inner_dim] == 1; + const auto outer_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + const auto outer_loop_end = insertLoopEnd( + NodeVector{result}, outer_loop_begin, 1, 1, std::vector{need_increment0, need_increment1, need_increment2}); + vector_buffer_max->add_control_dependency(outer_loop_begin); + } + + return std::make_shared(ResultVector{result}, input_params); +} +std::shared_ptr BroadcastAddLoweredFunction::initLowered() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + ov::NodeVector datas = {data0, data1}; + auto last_dim = std::max(input_shapes[0].get_shape().back(), std::max(input_shapes[1].get_shape().back(), m_target_shape.get_shape().back())); + ov::NodeVector loads(datas.size(), nullptr); + for (auto i = 0; i < datas.size(); i++) { + if (input_shapes[i].get_shape().back() != last_dim) { + auto new_shape = input_shapes[i]; + new_shape[new_shape.size() - 1] = last_dim; + loads[i] = std::make_shared(datas[i], new_shape); + } else { + loads[i] = std::make_shared(datas[i]); + } + } + auto add = std::make_shared(loads[0], loads[1]); + auto store = 
std::make_shared(add); + auto model = std::make_shared(NodeVector{store}, ParameterVector{data0, data1}); + + // Create dummy scheduler to pass graph comparison tests + // Note that if there is more than one results, they should be reverted + ResultVector results({model->get_results()[0]}); + const auto& inner_loop_begin = ngraph::snippets::op::insertLoopBegin(datas); + std::vector apply_increments(datas.size() + results.size(), true); + insertLoopEnd(results, inner_loop_begin, 1, 1, apply_increments); + auto outer_WA = std::accumulate(input_shapes.begin(), input_shapes.end(), 0, + [](int64_t max_val, const PartialShape& ps) { + return std::max(ps[ps.size() - 2].get_length(), max_val); + }); + if (outer_WA > 1) { + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(datas); + insertLoopEnd(results, outer_loop_begin, 1, 1, apply_increments); + } + return model; } } // namespace snippets } // namespace test diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_matmul.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_matmul.cpp new file mode 100644 index 00000000000..af312a2ee2d --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_matmul.cpp @@ -0,0 +1,92 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "subgraph_matmul.hpp" +#include "common_test_utils/data_utils.hpp" +#include + +namespace ov { +namespace test { +namespace snippets { +std::shared_ptr MatMulFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto matmul = std::make_shared(data0, data1); + return std::make_shared(NodeVector{matmul}, ParameterVector{data0, data1}); +} +std::shared_ptr MatMulFunction::initReference() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto indata0 = 
std::make_shared(precision, data0->get_output_partial_shape(0)); + auto indata1 = std::make_shared(precision, data1->get_output_partial_shape(0)); + auto matmul = std::make_shared(NodeVector{data0, data1}, + std::make_shared(NodeVector{std::make_shared(indata0, indata1)}, + ParameterVector{indata0, indata1})); + return std::make_shared(NodeVector{matmul}, ParameterVector{data0, data1}); +} +std::shared_ptr MatMulBiasFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto matmul = std::make_shared(data0, data1); + auto data2 = std::make_shared(precision, input_shapes[2]); + auto bias = std::make_shared(matmul, data2); + return std::make_shared(NodeVector{bias}, ParameterVector{data0, data1, data2}); +} +std::shared_ptr Transpose0213MatMulFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto const_order = std::make_shared(ov::element::i32, Shape {4}, std::vector{0, 2, 1, 3}); + std::shared_ptr result; + switch (transpose_position) { + case 0: { + auto transpose = std::make_shared(data0, const_order); + result = std::make_shared(transpose, data1); + break; + } case 1: { + auto transpose = std::make_shared(data1, const_order); + result = std::make_shared(data0, transpose); + break; + } case 2: { + auto matmul = std::make_shared(data0, data1); + result = std::make_shared(matmul, const_order); + break; + } + } + return std::make_shared(NodeVector{result}, ParameterVector{data0, data1}); +} + +std::shared_ptr TransposeMatMulFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto const_order = std::make_shared(ov::element::i32, Shape {4}, std::vector{0, 2, 3, 1}); + auto transpose = std::make_shared(data1, const_order); + auto matmul = std::make_shared(data0, 
transpose); + return std::make_shared(NodeVector{matmul}, ParameterVector{data0, data1}); +} +std::shared_ptr TransposeMatMulBiasFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto data2 = std::make_shared(precision, input_shapes[2]); + auto const_order = std::make_shared(ov::element::i32, Shape {4}, std::vector{0, 2, 3, 1}); + auto transpose = std::make_shared(data1, const_order); + auto matmul = std::make_shared(data0, transpose); + auto bias = std::make_shared(matmul, data2); + return std::make_shared(NodeVector{bias}, ParameterVector{data0, data1, data2}); +} +std::shared_ptr TransposeMulMatMulBiasFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto data2 = std::make_shared(precision, input_shapes[2]); + auto data3 = std::make_shared(precision, input_shapes[3]); + auto const_order = std::make_shared(ov::element::i32, Shape {4}, std::vector{0, 2, 3, 1}); + auto transpose = std::make_shared(data1, const_order); + auto mul = std::make_shared(transpose, data2); + auto matmul = std::make_shared(data0, mul); + auto bias = std::make_shared(matmul, data3); + return std::make_shared(NodeVector{bias}, ParameterVector{data0, data1, data2, data3}); +} + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_mha.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_mha.cpp new file mode 100644 index 00000000000..ac38ea47624 --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_mha.cpp @@ -0,0 +1,348 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "subgraph_mha.hpp" + +#include "common_test_utils/data_utils.hpp" +#include +#include 
"ngraph_functions/builders.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::shared_ptr MHAFunction::initOriginal() const { + auto transpose0Param = std::make_shared(precision, input_shapes[0]); + auto transpose1Param = std::make_shared(precision, input_shapes[1]); + auto addParam = std::make_shared(precision, input_shapes[2]); + auto transpose2Param = std::make_shared(precision, input_shapes[3]); + ngraph::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, transpose2Param}; + + std::vector constantShapes; + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1})); + constantShapes.push_back(ov::Shape({2})); + constantShapes.push_back(ov::Shape({4})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + + auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[0], std::vector{0, 2, 1, 3}); + auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[1], std::vector{0, 2, 3, 1}); + auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[5], std::vector{0, 2, 1, 3}); + auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[6], std::vector{0, 2, 1, 3}); + + std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * + input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), + -1}; + auto reshape0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[3], reshape0ConstData); + + std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), + static_cast(input_shapes[0].get_shape()[2]), + static_cast(input_shapes[0].get_shape()[1]), + 
static_cast(input_shapes[0].get_shape()[1])}; + auto reshape1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[4], reshape1ConstData); + + float transA = false; + float transB = false; + const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); + const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); + std::shared_ptr matmul_parent1 = transpose1; + if (with_mul) { + std::vector mulConstData(ngraph::shape_size(constantShapes[2])); + auto mulConst = ngraph::builder::makeConstant(precision, constantShapes[2], mulConstData, true); + matmul_parent1 = std::make_shared(transpose1, mulConst); + } + const auto matMul0 = std::make_shared(transpose0, matmul_parent1, transA, transB); + const auto add = std::make_shared(matMul0, addParam); + const auto reshape0 = std::make_shared(add, reshape0Const, true); + const auto softMax = std::make_shared(reshape0, 1); + const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); + const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto transpose3 = std::make_shared(matMul1, transpose3Const); + + ngraph::ResultVector results{std::make_shared(transpose3)}; + return std::make_shared(results, ngraphParam, "mha"); +} +std::shared_ptr MHAFunction::initReference() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto data2 = std::make_shared(precision, input_shapes[2]); + auto data3 = std::make_shared(precision, input_shapes[3]); + ngraph::ParameterVector ngraphParams = {data0, data1, data2, data3}; + NodeVector subgraph_inputs = {data0, data1, data2, data3}; + + auto transpose0Param = std::make_shared(precision, input_shapes[0]); + auto transpose1Param = std::make_shared(precision, input_shapes[1]); + auto addParam = std::make_shared(precision, input_shapes[2]); + auto 
transpose2Param = std::make_shared(precision, input_shapes[3]); + + std::vector constantShapes; + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1})); + constantShapes.push_back(ov::Shape({2})); + constantShapes.push_back(ov::Shape({4})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + + auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[0], std::vector{0, 2, 1, 3}); + auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[1], std::vector{0, 2, 3, 1}); + auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[5], std::vector{0, 2, 1, 3}); + auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[6], std::vector{0, 2, 1, 3}); + + ngraph::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param}; + + std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * + input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), + -1}; + auto reshape0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[3], reshape0ConstData); + + std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), + static_cast(input_shapes[0].get_shape()[2]), + static_cast(input_shapes[0].get_shape()[1]), + static_cast(input_shapes[0].get_shape()[1])}; + auto reshape1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[4], reshape1ConstData); + + float transA = false; + float transB = false; + const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); + const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); + 
std::shared_ptr matmul_parent1 = transpose1; + if (with_mul) { + std::vector mulConstData(ngraph::shape_size(constantShapes[2])); + auto mulConst = ngraph::builder::makeConstant(precision, constantShapes[2], mulConstData, true); + auto mulParam = std::make_shared(precision, mulConst->get_shape()); + matmul_parent1 = std::make_shared(transpose1, mulParam); + subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; + subgraph_inputs = {data0, data1, mulConst, data2, data3}; + } + const auto matMul0 = std::make_shared(transpose0, matmul_parent1, transA, transB); + const auto add = std::make_shared(matMul0, addParam); + const auto reshape0 = std::make_shared(add, reshape0Const, true); + const auto softMax = std::make_shared(reshape0, 1); + const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); + const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto transpose3 = std::make_shared(matMul1, transpose3Const); + + auto subgraph = std::make_shared(subgraph_inputs, + std::make_shared(NodeVector{transpose3}, subgraph_params)); + + return std::make_shared(NodeVector{subgraph}, ngraphParams); +} + +std::shared_ptr MHAMatMul0TransposeFunction::initOriginal() const { + auto transpose0Param = std::make_shared(precision, input_shapes[0]); + auto transpose1Param = std::make_shared(precision, input_shapes[1]); + auto addParam = std::make_shared(precision, input_shapes[2]); + auto transpose2Param = std::make_shared(precision, input_shapes[3]); + ngraph::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, transpose2Param}; + + std::vector constantShapes; + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1})); + 
constantShapes.push_back(ov::Shape({2})); + constantShapes.push_back(ov::Shape({4})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + + const auto order = std::vector{0, 2, 1, 3}; + auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[0], order); + auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[1], order); + auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[5], order); + auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[6], order); + + std::vector mulConstData(1); + auto mulConst = ngraph::builder::makeConstant(precision, ov::Shape{1}, mulConstData, true); + + std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * + input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), + -1}; + auto reshape0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[3], reshape0ConstData); + + std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), + static_cast(input_shapes[0].get_shape()[2]), + static_cast(input_shapes[0].get_shape()[1]), + static_cast(input_shapes[0].get_shape()[1])}; + auto reshape1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[4], reshape1ConstData); + + float transA = false; + float transB = false; + const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); + const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); + const auto mul = std::make_shared(transpose1, mulConst); + const auto matMul0 = std::make_shared(transpose0, mul, transA, true); + const auto add = std::make_shared(matMul0, addParam); + const auto reshape0 = std::make_shared(add, reshape0Const, true); + const auto softMax = std::make_shared(reshape0, 1); + const auto reshape1 = 
std::make_shared(softMax, reshape1Const, true); + const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); + const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto transpose3 = std::make_shared(matMul1, transpose3Const); + + ngraph::ResultVector results{std::make_shared(transpose3)}; + return std::make_shared(results, ngraphParam, "mha"); +} +std::shared_ptr MHAMatMul0TransposeFunction::initReference() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto data2 = std::make_shared(precision, input_shapes[2]); + auto data3 = std::make_shared(precision, input_shapes[3]); + ngraph::ParameterVector ngraphParams = {data0, data1, data2, data3}; + + auto transpose0Param = std::make_shared(precision, input_shapes[0]); + auto transpose1Param = std::make_shared(precision, input_shapes[1]); + auto addParam = std::make_shared(precision, input_shapes[2]); + auto transpose2Param = std::make_shared(precision, input_shapes[3]); + + std::vector constantShapes; + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1})); + constantShapes.push_back(ov::Shape({2})); + constantShapes.push_back(ov::Shape({4})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + + auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[0], std::vector{0, 2, 1, 3}); + auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[1], std::vector{0, 2, 3, 1}); + auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[5], std::vector{0, 2, 1, 3}); + auto transpose3Const = 
ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[6], std::vector{0, 2, 1, 3}); + + std::vector mulConstData(1); + auto mulConst = ngraph::builder::makeConstant(precision, ov::Shape{1}, mulConstData, true); + ngraph::ParameterVector subgraphParams = {transpose0Param, transpose1Param, addParam, transpose2Param}; + + std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * + input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), + -1}; + auto reshape0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[3], reshape0ConstData); + + std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), + static_cast(input_shapes[0].get_shape()[2]), + static_cast(input_shapes[0].get_shape()[1]), + static_cast(input_shapes[0].get_shape()[1])}; + auto reshape1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[4], reshape1ConstData); + + float transA = false; + float transB = false; + const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); + const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); + const auto mul = std::make_shared(transpose1, mulConst); + const auto matMul0 = std::make_shared(transpose0, mul, transA, transB); + const auto add = std::make_shared(matMul0, addParam); + const auto reshape0 = std::make_shared(add, reshape0Const, true); + const auto softMax = std::make_shared(reshape0, 1); + const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); + const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto transpose3 = std::make_shared(matMul1, transpose3Const); + + auto subgraph = std::make_shared( + NodeVector{data0, data1, data2, data3}, + std::make_shared(NodeVector{transpose3}, subgraphParams)); + + return std::make_shared(NodeVector{subgraph}, ngraphParams); +} + +std::shared_ptr 
MHASelectFunction::initOriginal() const { + auto transpose0Param = std::make_shared(precision, input_shapes[0]); + auto transpose1Param = std::make_shared(precision, input_shapes[1]); + auto addParam = std::make_shared(precision, input_shapes[2]); + auto less0Param = std::make_shared(precision, input_shapes[3]); + auto less1Param = std::make_shared(precision, input_shapes[4]); + auto transpose2Param = std::make_shared(precision, input_shapes[5]); + ngraph::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, less0Param, less1Param, transpose2Param}; + + std::vector constantShapes; + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1})); + constantShapes.push_back(ov::Shape({2})); + constantShapes.push_back(ov::Shape({4})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + + auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[0], + std::vector{0, 2, 1, 3}); + auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[1], + std::vector{0, 2, 3, 1}); + auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[5], + std::vector{0, 2, 1, 3}); + auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[6], + std::vector{0, 2, 1, 3}); + + std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * + input_shapes[0].get_shape()[1] * + input_shapes[0].get_shape()[2]), + -1}; + auto reshape0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[3], reshape0ConstData); + + std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), + static_cast(input_shapes[0].get_shape()[2]), + 
static_cast(input_shapes[0].get_shape()[1]), + static_cast(input_shapes[0].get_shape()[1])}; + auto reshape1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[4], reshape1ConstData); + // Value is equal to '1' - to avoid situation e^(-1000) / (sum(e^(-1000)) = 0/0 = NAN + auto selectConst = ngraph::builder::makeConstant(precision, ov::Shape{1}, std::vector{1}); + + float transA = false; + float transB = false; + const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); + const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); + const auto matMul0 = std::make_shared(transpose0, transpose1, transA, transB); + const auto add = std::make_shared(matMul0, addParam); + const auto less = std::make_shared(less0Param, less1Param); + std::shared_ptr selectCond = less; + if (add->get_output_partial_shape(0) != input_shapes[3]) { + const auto broadcast_shape = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[5], + add->get_output_shape(0)); + const auto broadcast = ngraph::builder::makeBroadcast(selectCond, broadcast_shape, + ngraph::op::BroadcastType::NUMPY); + selectCond = broadcast; + } + const auto select = std::make_shared(selectCond, selectConst, add, + ngraph::op::AutoBroadcastType::NUMPY); + const auto reshape0 = std::make_shared(select, reshape0Const, true); + const auto softMax = std::make_shared(reshape0, 1); + const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); + const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto transpose3 = std::make_shared(matMul1, transpose3Const); + + // to generate valid values + less0Param->set_friendly_name("less0"); + less0Param->set_friendly_name("less1"); + + ngraph::ResultVector results{std::make_shared(transpose3)}; + return std::make_shared(results, ngraphParam, "mha"); +} + +std::shared_ptr 
MHAWOTransposeOnInputsFunction::initOriginal() const { + auto param0 = std::make_shared(precision, input_shapes[0]); + auto param1 = std::make_shared(precision, input_shapes[1]); + auto param2 = std::make_shared(precision, input_shapes[2]); + ngraph::ParameterVector ngraphParam = {param0, param1, param2}; + + auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape({4}), std::vector{0, 2, 1, 3}); + + float transA = false; + float transB = false; + const auto mulConst = ngraph::builder::makeConstant(precision, ov::Shape({1}), std::vector{1}, true); + const auto mul = std::make_shared(param1, mulConst); + const auto matMul0 = std::make_shared(param0, mul, transA, transB); + const auto softmax = std::make_shared(matMul0, 3); + const auto matMul1 = std::make_shared(softmax, param2, transA, transB); + const auto transpose3 = std::make_shared(matMul1, transpose3Const); + + ngraph::ResultVector results{std::make_shared(transpose3)}; + return std::make_shared(results, ngraphParam, "mha"); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp index a8e7aa6aa76..6ad6a087e2e 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp @@ -28,15 +28,16 @@ std::shared_ptr AddFunction::initReference() const { } std::shared_ptr AddConstFunction::initOriginal() const { auto data0 = std::make_shared(precision, input_shapes[0]); - const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[0]), -10., 10.); - auto const_data1 = std::make_shared(precision, input_shapes[0], const_values); + const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[0].get_shape()), -10., 10.); + auto const_data1 = 
std::make_shared(precision, input_shapes[0].get_shape(), const_values); auto add = std::make_shared(data0, const_data1); return std::make_shared(NodeVector{add}, ParameterVector{data0}); } std::shared_ptr AddRollConstFunction::initOriginal() const { - auto data0 = std::make_shared(precision, input_shapes[0]); - const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[0]), -10., 10.); - auto const_data1 = std::make_shared(precision, input_shapes[0], const_values); + const auto input_shape = input_shapes[0].get_shape(); + auto data0 = std::make_shared(precision, input_shape); + const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shape), -10., 10.); + auto const_data1 = std::make_shared(precision, input_shape, const_values); auto shift = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{1}); auto axes = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{0}); auto roll0 = std::make_shared(data0, shift, axes); @@ -49,7 +50,7 @@ std::shared_ptr AddRollConstFunction::initOriginal() const { std::shared_ptr EltwiseFunction::initOriginal() const { auto data0 = std::make_shared(precision, input_shapes[0]); auto data1 = std::make_shared(precision, input_shapes[1]); - const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[1]), -10., 10.); + const std::vector const_values = CommonTestUtils::generate_float_numbers(1, -10., 10.); auto const_data = std::make_shared(precision, data1->get_shape(), const_values); auto add = std::make_shared(data0, data1); auto sub = std::make_shared(add, const_data); @@ -59,7 +60,7 @@ std::shared_ptr EltwiseFunction::initOriginal() const { std::shared_ptr EltwiseFunction::initReference() const { auto data0 = std::make_shared(precision, input_shapes[0]); auto data1 = std::make_shared(precision, input_shapes[1]); - const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[1]), 
-10., 10.); + const std::vector const_values = CommonTestUtils::generate_float_numbers(1, -10., 10.); auto const_data = std::make_shared(precision, data1->get_shape(), const_values); auto indata0 = std::make_shared(precision, data0->get_shape()); auto indata1 = std::make_shared(precision, data1->get_shape()); @@ -108,7 +109,9 @@ std::shared_ptr EltwiseMaxNumParamsFunction::initOriginal() const { std::shared_ptr MatMulEltwiseBranchesFunction::initOriginal() const { auto data_1 = std::make_shared(precision, input_shapes[0]); auto data_2 = std::make_shared(precision, input_shapes[1]); - auto non_snippet_op = std::make_shared(data_1, data_2); + auto sinh_1 = std::make_shared(data_1); + auto sinh_2 = std::make_shared(data_2); + auto non_snippet_op = std::make_shared(sinh_1, sinh_2); const std::vector const_values = CommonTestUtils::generate_float_numbers(4, -10., 10.); auto mul_const_1 = op::v0::Constant::create(precision, {1}, {const_values[0]}); auto mul_1 = std::make_shared(non_snippet_op, mul_const_1); @@ -131,17 +134,19 @@ std::shared_ptr MatMulEltwiseBranchesFunction::initOriginal() const { std::shared_ptr MatMulEltwiseBranchesFunction::initReference() const { auto data_1 = std::make_shared(precision, input_shapes[0]); auto data_2 = std::make_shared(precision, input_shapes[1]); + auto sinh_1 = std::make_shared(data_1); + auto sinh_2 = std::make_shared(data_2); const std::vector const_values = CommonTestUtils::generate_float_numbers(4, -10., 10.); // snippet inputs - auto non_snippet_op = std::make_shared(data_1, data_2); + auto non_snippet_op = std::make_shared(sinh_1, sinh_2); auto mul_const_1 = std::make_shared(precision, Shape{1}, const_values[0]); auto add_const_1 = std::make_shared(precision, Shape{1}, const_values[1]); auto mul_const_2 = std::make_shared(precision, Shape{1}, const_values[2]); auto sub_const_2 = std::make_shared(precision, Shape{1}, const_values[3]); // snippet function - Shape matMulOutShape = input_shapes[0]; - matMulOutShape.back() = 
input_shapes[1].back(); + Shape matMulOutShape = input_shapes[0].get_shape(); + matMulOutShape.back() = input_shapes[1].get_shape().back(); auto snippet_input = std::make_shared(precision, matMulOutShape); auto mul_1 = std::make_shared(snippet_input, mul_const_1); @@ -272,6 +277,37 @@ std::shared_ptr TwoInputsAndOutputsFunction::initOriginal() const { return std::make_shared(NodeVector{hswish, sin3}, ParameterVector{data0, data1}); } +std::shared_ptr SelectFunction::initOriginal() const { + auto data0 = std::make_shared(ov::element::boolean, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto data2 = std::make_shared(precision, input_shapes[2]); + auto select = std::make_shared(data0, data1, data2); + + return std::make_shared(NodeVector{select}, ParameterVector{data0, data1, data2}); +} + +std::shared_ptr BroadcastAddFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto target_shape = std::make_shared(ov::element::i32, ov::Shape{m_target_shape.size()}, m_target_shape.get_shape()); + auto broadcast = std::make_shared(data0, target_shape); + auto add = std::make_shared(broadcast, data1); + + return std::make_shared(NodeVector{add}, ParameterVector{data0, data1}); +} + + +std::shared_ptr BroadcastSelectFunction::initOriginal() const { + auto data0 = std::make_shared(ov::element::boolean, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto data2 = std::make_shared(precision, input_shapes[2]); + auto target_shape = std::make_shared(ov::element::i32, ov::Shape{m_target_shape.size()}, m_target_shape.get_shape()); + auto broadcast = std::make_shared(data0, target_shape); + auto select = std::make_shared(broadcast, data1, data2); + + return std::make_shared(NodeVector{select}, ParameterVector{data0, data1, data2}); +} + } // namespace snippets } // namespace test } // namespace ov diff --git 
a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp new file mode 100644 index 00000000000..fb692734ebd --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp @@ -0,0 +1,52 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "subgraph_softmax.hpp" +#include "common_test_utils/data_utils.hpp" +#include +#include "ngraph_functions/builders.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::shared_ptr SoftmaxFunction::initOriginal() const { + auto data = std::make_shared(precision, input_shapes[0]); + auto softmax = std::make_shared(data, axis); + return std::make_shared(NodeVector{softmax}, ParameterVector{data}); +} + +std::shared_ptr AddSoftmaxFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto add = std::make_shared(data0, data1); + auto softmax = std::make_shared(add, axis); + return std::make_shared(NodeVector{softmax}, ParameterVector{data0, data1}); +} + +std::shared_ptr TransposeSoftmaxFunction::initOriginal() const { + const auto transpose0Param = std::make_shared(precision, input_shapes[0]); + const auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{m_order.size()}, m_order); + const auto transpose2 = std::make_shared(transpose0Param, transpose0Const); + const auto softMax = std::make_shared(transpose2, m_axis); + return std::make_shared(ov::NodeVector{softMax}, ov::ParameterVector {transpose0Param}, "softmax_transpose"); +} + +std::shared_ptr TransposeSoftmaxEltwiseFunction::initOriginal() const { + const auto transpose0Param = std::make_shared(precision, input_shapes[0]); + const auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{m_order.size()}, + m_order); + const auto 
transpose2 = std::make_shared(transpose0Param, transpose0Const); + const auto mulConst = ngraph::builder::makeConstant(ngraph::element::f32, transpose2->get_shape(), + std::vector{}, true); + const auto mul = std::make_shared(transpose2, mulConst); + const auto softMax = std::make_shared(mul, m_axis); + const auto hswish = std::make_shared(softMax); + return std::make_shared(ov::NodeVector{hswish}, ov::ParameterVector{transpose0Param}, + "softmax_transpose"); +} + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_transpose.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_transpose.cpp new file mode 100644 index 00000000000..dcfb04a74d9 --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_transpose.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "subgraph_transpose.hpp" +#include "common_test_utils/data_utils.hpp" +#include + +namespace ov { +namespace test { +namespace snippets { +std::shared_ptr TransposeFunction::initOriginal() const { + auto data = std::make_shared(precision, input_shapes[0]); + auto const_order = std::make_shared(ov::element::i32, Shape {order.size()}, order); + auto transpose = std::make_shared(data, const_order); + return std::make_shared(NodeVector{transpose}, ParameterVector{data}); +} +std::shared_ptr TransposeFunction::initReference() const { + auto data = std::make_shared(precision, input_shapes[0]); + auto const_order = std::make_shared(ov::element::i32, Shape {order.size()}, order); + auto indata0 = std::make_shared(precision, data->get_output_partial_shape(0)); + auto indata1 = std::make_shared(const_order->get_output_element_type(0), + const_order->get_output_partial_shape(0)); + auto transpose = std::make_shared(NodeVector{data, const_order}, + 
std::make_shared(NodeVector{std::make_shared(indata0, indata1)}, + ParameterVector{indata0, indata1})); + return std::make_shared(NodeVector{transpose}, ParameterVector{data}); +} + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file