diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp index 27a7d4408b9..9ecf82d3893 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp @@ -34,6 +34,8 @@ struct memory { virtual void unlock(const stream& stream) = 0; virtual event::ptr fill(stream& stream, unsigned char pattern) = 0; virtual event::ptr fill(stream& stream) = 0; + // only supports gpu_usm + virtual void* buffer_ptr() const { return nullptr; } size_t size() const { return _bytes_count; } size_t count() const { return _layout.count(); } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/basic_memory_dependencies.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/basic_memory_dependencies.cpp index c86f79d4baa..db3b0e84cf0 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/basic_memory_dependencies.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/basic_memory_dependencies.cpp @@ -36,39 +36,26 @@ void basic_memory_dependencies::run(program& p) { add_memory_dependency(it, node); } - if (node->is_type() && node->get_preferred_impl_type() == impl_types::onednn) { - auto& conv = node->as(); - bool can_reuse_eltwise_mem = false; + if (node->get_preferred_impl_type() == impl_types::onednn + && (node->is_type() || node->is_type())) { size_t eltw_dep = 0; - - for (auto& fused_op : conv.get_fused_primitives()) { + for (auto& fused_op : node->get_fused_primitives()) { if (fused_op.node->is_type() && fused_op.deps.size() == 1) { - auto eltw_in_layout = conv.get_dependency(fused_op.dep_start_idx).get_output_layout(); - auto conv_out_layout = node->get_output_layout(); - if (eltw_dep > 0) { - can_reuse_eltwise_mem = false; - break; - } + // If it is first sum, reuse the buffer + auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op); + if (fusing_type != add_fusing_type::sum || eltw_dep != 0) + continue; - if (eltw_in_layout.size == conv_out_layout.size && - eltw_in_layout.format == conv_out_layout.format && - eltw_in_layout.data_padding == conv_out_layout.data_padding && - data_type_traits::size_of(eltw_in_layout.data_type) == data_type_traits::size_of(conv_out_layout.data_type)) { - eltw_dep = fused_op.dep_start_idx; - can_reuse_eltwise_mem = true; + eltw_dep = fused_op.dep_start_idx; + auto& eltw_node = node->get_dependency(eltw_dep); + eltw_node.can_share_buffer(false); + node->can_share_buffer(false); + for (auto& user : node->get_users()) { + add_memory_dependency(user, &eltw_node); + add_memory_dependency(user, node); } } } - - if (can_reuse_eltwise_mem) { - auto& eltw_node = conv.get_dependency(eltw_dep); - eltw_node.can_share_buffer(false); - conv.can_share_buffer(false); - for (auto& user : conv.get_users()) { - add_memory_dependency(user, &eltw_node); - add_memory_dependency(user, &conv); - } - } } // Note we iterate over processing order, it means if primitve has processing num greater than any of outputs, diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 0e90e92b0d2..f433aa54ce0 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -93,16 +93,11 @@ bool concat_in_place_optimization::match(concatenation_node& node) { for (auto& input : node.get_dependencies()) { if (input->get_preferred_impl_type() == impl_types::onednn) { for (auto& fused_op : input->get_fused_primitives()) { - if (fused_op.node->is_type() && fused_op.deps.size() == 1) { - auto& eltw_in = input->get_dependency(fused_op.dep_start_idx); - auto eltw_in_layout = eltw_in.get_output_layout(); - auto out_layout = input->get_output_layout(); - - if (!program_helpers::needs_onednn_sum_post_op(fused_op.node->as(), eltw_in_layout)) - continue; - if (program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in_layout, out_layout)) - return false; - } + auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(*input, fused_op); + if (add_type == add_fusing_type::sum) + return false; + else + continue; } is_onednn_impl = true; } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp index 5fe21f21682..f9f50b56cf0 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp @@ -649,44 +649,38 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf) // When the conv node is of onednn impl type and eltwise sum with full tensor is fused, // changes the input format of eltwise sum post-op to use binary add. if (conv_node.get_preferred_impl_type() == impl_types::onednn) { - std::vector eltw_sum_dep_indices; - for (size_t i = 1; i < conv_node.get_dependencies().size(); i++) { - auto& dep = conv_node.get_dependency(i); - for (auto& fused_op : conv_node.get_fused_primitives()) { - if (fused_op.node->is_type() - && fused_op.node->as().get_primitive()->mode == eltwise_mode::sum - && !program_helpers::needs_onednn_sum_post_op(fused_op.node->as(), - conv_node.get_dependency(fused_op.dep_start_idx).get_output_layout()) - && conv_node.get_dependency(fused_op.dep_start_idx).get_users().size() == 1 - && conv_node.get_dependency(fused_op.dep_start_idx).id() == dep.id()) { - eltw_sum_dep_indices.push_back(i); - } - } - } + onednn_add_fusing_helpers::for_eltwise(conv_node, eltwise_mode::sum, + [&](const program_node& p_node, const eltwise_node& e_node, const fused_primitive_desc& desc) { + auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(p_node, desc); + if (fusing_type == add_fusing_type::binary_per_tensor) { + auto& dep_node = p_node.get_dependency(desc.dep_start_idx); + auto d_layout = dep_node.get_output_layout(); + auto d_format = d_layout.format; + auto expected_format = format::any; - auto conv_layout = conv_node.get_output_layout(); - for (auto& dep_id : eltw_sum_dep_indices) { - auto& prev_node = conv_node.get_dependency(dep_id); - auto old_layout = prev_node.get_output_layout(); - auto expected_format = format::any; - if ((conv_layout.data_type == data_types::f16 || conv_layout.data_type == data_types::f32) - && data_type_traits::is_i8_u8(old_layout.data_type)) { - if (conv_layout.format == format::b_fs_yx_fsv16) - expected_format = format::b_fs_yx_fsv32; - if (conv_layout.format == format::bs_fs_yx_bsv32_fsv16) - expected_format = format::bs_fs_yx_bsv32_fsv32; - } + if (data_type_traits::is_i8_u8(d_layout.data_type)) { + if (d_format == format::b_fs_yx_fsv16) + expected_format = format::b_fs_yx_fsv32; + else if (d_format == format::bs_fs_yx_bsv32_fsv16) + expected_format = format::bs_fs_yx_bsv32_fsv32; + } else if (data_type_traits::is_floating_point(d_layout.data_type)) { + if (d_format == format::b_fs_yx_fsv32) + expected_format = format::b_fs_yx_fsv16; + else if (d_format == format::bs_fs_yx_bsv32_fsv32) + expected_format = format::bs_fs_yx_bsv32_fsv16; + } - if (expected_format != format::any && old_layout.format != expected_format) { - auto new_layout = old_layout; - new_layout.format = expected_format; - auto new_input = rf.get_reorder(prev_node.id(), old_layout, new_layout); - if (new_input.first) { - p.add_intermediate(new_input.first, conv_node, dep_id, !new_input.second); + if (expected_format != format::any && d_layout.format != expected_format) { + auto new_layout = d_layout; + new_layout.format = expected_format; + auto new_input = rf.get_reorder(dep_node.id(), d_layout, new_layout); + if (new_input.first) { + p.add_intermediate(new_input.first, conv_node, desc.dep_start_idx, !new_input.second); + } + conv_node.get_dependency(desc.dep_start_idx).set_output_layout(new_layout, false); + } } - conv_node.get_dependency(dep_id).set_output_layout(new_layout, false); - } - } + }); } }; diff --git a/src/plugins/intel_gpu/src/graph/include/program_helpers.h b/src/plugins/intel_gpu/src/graph/include/program_helpers.h index 5794c00bf37..dee11d8d88f 100644 --- a/src/plugins/intel_gpu/src/graph/include/program_helpers.h +++ b/src/plugins/intel_gpu/src/graph/include/program_helpers.h @@ -11,10 +11,12 @@ #include "intel_gpu/graph/program.hpp" #include "data_inst.h" #include "eltwise_inst.h" +#include "convolution_inst.h" #include #include #include +#include namespace cldnn { struct program_helpers { @@ -125,12 +127,35 @@ struct program_helpers { } } static layout get_weights_layout(typed_program_node& data_node, int32_t split); - - static bool are_layouts_identical_for_onednn_sum_post_op(layout input_layout, layout output_layout); - - static bool needs_onednn_sum_post_op(const eltwise_node& n, layout input_layout); }; +struct onednn_add_fusing_helpers { + enum class add_fusing_type { + sum, + binary_per_tensor, + binary_per_oc, + not_supported, + }; + + static bool is_full_tensor(const layout& layout); + static std::vector get_fused_eltwise_primitives(); + static void for_eltwise(const program_node& conv_node, eltwise_mode mode, + std::function func); + static add_fusing_type get_add_fusing_type(const program_node& node, const fused_primitive_desc& desc); +}; + +using add_fusing_type = onednn_add_fusing_helpers::add_fusing_type; + +static inline std::ostream& operator<< (std::ostream& os, add_fusing_type& t) { + switch (t) { + case add_fusing_type::sum: os << "sum"; break; + case add_fusing_type::binary_per_tensor: os << "binary_per_tensor"; break; + case add_fusing_type::binary_per_oc: os << "binary_per_oc"; break; + default: os << "not_supported"; break; + } + return os; +} + // Base class for performing pattern match style optimizations. // Uses CRTP idiom, implementing class should be passed as template parameter `Impl`, // and overload match and optimize methods. diff --git a/src/plugins/intel_gpu/src/graph/include/program_node.h b/src/plugins/intel_gpu/src/graph/include/program_node.h index bc0d0c0607f..a8380c867bc 100644 --- a/src/plugins/intel_gpu/src/graph/include/program_node.h +++ b/src/plugins/intel_gpu/src/graph/include/program_node.h @@ -56,6 +56,30 @@ enum class onednn_post_op_type : uint32_t { optimized_sum }; +static inline std::ostream& operator<< (std::ostream& os, onednn_post_op_type& t) { + switch (t) { + case onednn_post_op_type::eltwise_act: os << "eltwise_act"; break; + case onednn_post_op_type::eltwise_clip: os << "eltwise_clip"; break; + case onednn_post_op_type::eltwise_linear: os << "eltwise_linear"; break; + case onednn_post_op_type::eltwise_round: os << "eltwise_round"; break; + case onednn_post_op_type::binary_mul: os << "binary_mul"; break; + case onednn_post_op_type::binary_add: os << "binary_add"; break; + case onednn_post_op_type::binary_max: os << "binary_max"; break; + case onednn_post_op_type::binary_min: os << "binary_min"; break; + case onednn_post_op_type::binary_relu: os << "binary_relu"; break; + case onednn_post_op_type::scale: os << "scale"; break; + case onednn_post_op_type::sum: os << "sum"; break; + case onednn_post_op_type::optimized: os << "optimized"; break; + case onednn_post_op_type::optimized_eltwise_act: os << "optimized_eltwise_act"; break; + case onednn_post_op_type::optimized_eltwise_clip: os << "optimized_eltwise_clip"; break; + case onednn_post_op_type::optimized_eltwise_linear: os << "optimized_eltwise_linear"; break; + case onednn_post_op_type::optimized_eltwise_round: os << "optimized_eltwise_round"; break; + case onednn_post_op_type::optimized_sum: os << "optimized_sum"; break; + default: os << "invalid"; + } + return os; +} + struct fused_primitive_desc_onednn { onednn_post_op_type op_type; // onednn post-operation type size_t mem_offset; // index of a memory buffer for current post-operation diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index 81c3354194a..80947227651 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -198,12 +198,11 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next, // Not to fuse reorder if this removal changes input format of its next node which has reuse in fused_op if (next.get_preferred_impl_type() == impl_types::onednn) { for (auto& fused_op : next.get_fused_primitives()) { - if (fused_op.node->is_type() && fused_op.deps.size() == 1) { + if (fused_op.node->is_type()) { auto eltw_in_layout = next.get_dependency(fused_op.dep_start_idx).get_output_layout(); auto out_layout = next.get_output_layout(); - if (program_helpers::needs_onednn_sum_post_op(fused_op.node->as(), eltw_in_layout) && - program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in_layout, out_layout) && - prev.get_output_layout().format != out_layout.format) + auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(next, fused_op); + if (add_type == add_fusing_type::sum && prev.get_output_layout().format != out_layout.format) return false; } } @@ -947,23 +946,6 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout, bool use_onednn_impls = _optimization_attributes.use_onednn_impls && input_layout.data_type != data_types::f32; bool i8_u8_input = input_layout.data_type == data_types::u8 || input_layout.data_type == data_types::i8; - if (use_onednn_impls && onednn_valid_post_ops) { - for (auto& fo : node.get_fused_primitives()) { - if (fo.node->is_type()) { - auto in_layout = node.get_dependency(fo.dep_start_idx).get_output_layout(); - auto out_layout = node.get_output_layout(); - auto in_dt = in_layout.data_type; - auto out_dt = out_layout.data_type; - if ((out_layout.count() == in_layout.count()) && - (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt && - program_helpers::needs_onednn_sum_post_op(fo.node->as(), in_layout)) { - onednn_valid_post_ops = false; - break; - } - } - } - } - if (use_onednn_impls && onednn_valid_post_ops) { std::function has_any_convolutions_below; has_any_convolutions_below = [&](const program_node& node) -> bool { @@ -1373,23 +1355,6 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format impl_candidate = impl_types::ocl; } - // [WA] to avoid an onednn kernel issue of multiple sum post-ops - if (!node.get_fused_primitives().empty()) { - size_t sum_post_op_cnt = 0; - for (auto& fused_op : node.get_fused_primitives()) { - if (fused_op.node->is_type() && node.get_dependencies().size() > fused_op.dep_start_idx && fused_op.deps.size() == 1) { - auto& eltw_in = node.get_dependency(fused_op.dep_start_idx); - if (program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in.get_output_layout(), node.get_output_layout()) && - program_helpers::needs_onednn_sum_post_op(fused_op.node->as(), eltw_in.get_output_layout())) { - if (sum_post_op_cnt > 0) - return impl_types::ocl; - - sum_post_op_cnt += 1; - } - } - } - } - if (node.is_type()) { // oneDNN doesn't have good support for groups with fsv16 fmt auto& conv = node.as(); @@ -1418,29 +1383,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format impl_candidate = impl_types::ocl; } - size_t eltw_dep = 0; for (auto& fo : node.get_fused_primitives()) { - if (fo.node->is_type()) { - auto in_layout = node.get_dependency(fo.dep_start_idx).get_output_layout(); - auto out_layout = node.get_output_layout(); - auto in_dt = in_layout.data_type; - auto out_dt = out_layout.data_type; - if (program_helpers::needs_onednn_sum_post_op(fo.node->as(), in_layout)) { - if ((out_layout.count() == in_layout.count()) && - (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) { - impl_candidate = impl_types::ocl; - break; - } - if (in_layout.size == out_layout.size && in_layout.format == out_layout.format && in_layout.data_padding == out_layout.data_padding && - data_type_traits::size_of(in_dt) == data_type_traits::size_of(out_dt)) { - if (eltw_dep > 0) { - impl_candidate = impl_types::ocl; - break; - } - eltw_dep = fo.dep_start_idx; - } - } - } else if (fo.node->is_type()) { + if (fo.node->is_type()) { // Some activations aren't implemented in oneDNN auto activation_prim = fo.node->as().get_primitive(); if (activation_prim->activation_function == activation_func::negative || @@ -1486,15 +1430,17 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format auto out_layout = node.get_output_layout(); auto in_dt = in_layout.data_type; auto out_dt = out_layout.data_type; - if ((out_layout.count() == in_layout.count()) && - (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt && - program_helpers::needs_onednn_sum_post_op(fo.node->as(), in_layout)) { + // if it is not eltwise sum and input is full tensor + if ((out_layout.count() == in_layout.count()) && in_dt != out_dt + && (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) + && onednn_add_fusing_helpers::is_full_tensor(in_layout)) { impl_candidate = impl_types::ocl; break; } - if (fo.node->as().get_primitive()->mode == eltwise_mode::sum && - program_helpers::needs_onednn_sum_post_op(fo.node->as(), in_layout)) { + // WA: onednn sum/binary_add post-op are not supported due to perf drop. + auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(node, fo); + if (add_type == add_fusing_type::sum || add_type == add_fusing_type::binary_per_tensor || add_type == add_fusing_type::binary_per_oc) { impl_candidate = impl_types::ocl; break; } diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index b4fb14c6bc6..4cd38a00eaa 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -536,54 +536,22 @@ void network::allocate_primitives() { for (auto const& node : _program->get_processing_order()) { if (node->get_preferred_impl_type() == impl_types::onednn) { - bool can_reuse_eltwise_mem = false; size_t eltw_dep = 0; - for (auto& fused_op : node->get_fused_primitives()) { if (fused_op.node->is_type() && fused_op.deps.size() == 1) { - auto& eltw_in = node->get_dependency(fused_op.dep_start_idx); - auto eltw_in_layout = eltw_in.get_output_layout(); - auto out_layout = node->get_output_layout(); - - if (!program_helpers::needs_onednn_sum_post_op(fused_op.node->as(), eltw_in_layout)) + // If it is first sum, reuse the buffer + auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op); + if (fusing_type != add_fusing_type::sum || eltw_dep != 0) continue; - - if (program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in_layout, out_layout)) { - if (eltw_dep > 0) - throw std::runtime_error("Unsupported multiple full size tensors."); - - eltw_dep = fused_op.dep_start_idx; - can_reuse_eltwise_mem = true; + eltw_dep = fused_op.dep_start_idx; + auto& eltw_in = node->get_dependency(eltw_dep); + if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) { + auto& eltw_inst = _primitives.at(eltw_in.id()); + auto& prim_inst = _primitives.at(node->id()); + auto& eltw_mem = eltw_inst->output_memory(); + auto new_mem = eltw_mem.get_engine()->reinterpret_buffer(eltw_mem, node->get_output_layout()); + prim_inst->set_output_memory(new_mem); } - - if (!can_reuse_eltwise_mem) { - if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) { - auto& eltw_inst = _primitives.at(eltw_in.id()); - auto& prim_inst = _primitives.at(node->id()); - auto eltw_mem_type = eltw_inst->output_memory().get_allocation_type(); - auto prim_mem_type = prim_inst->output_memory().get_allocation_type(); - - // Keep lockable memory type for `prim_inst` output if needed - if (eltw_mem_type != prim_mem_type && eltw_mem_type != allocation_type::cl_mem && eltw_mem_type != allocation_type::usm_host) - can_reuse_eltwise_mem = false; - } - } - - if (program_helpers::needs_onednn_sum_post_op(fused_op.node->as(), eltw_in_layout) && !can_reuse_eltwise_mem) { - throw std::runtime_error("Buffer reuse is required for onednn sum post operation."); - } - } - } - - if (can_reuse_eltwise_mem) { - auto& eltw_in = node->get_dependency(eltw_dep); - if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) { - auto& eltw_inst = _primitives.at(eltw_in.id()); - auto& prim_inst = _primitives.at(node->id()); - auto& eltw_mem = eltw_inst->output_memory(); - auto new_mem = eltw_mem.get_engine()->reinterpret_buffer(eltw_mem, node->get_output_layout()); - - prim_inst->set_output_memory(new_mem); } } } @@ -698,8 +666,21 @@ void network::execute_impl(const std::vector& events) { } GPU_DEBUG_IF(debug_config->verbose >= 1) { + std::ostringstream in_addr; + // buffer_ptr() only support usm_memory + for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) { + auto& in_mem = get_primitive(inst->id())->dep_memory(i); + in_addr << in_mem.buffer_ptr(); + if (i < get_primitive(inst->id())->dependencies().size() - 1) { + in_addr << ", "; + } + } + auto& out_mem = get_primitive(inst->id())->output_memory(); + GPU_DEBUG_COUT << "Execute " << inst->id() << ", memory type: " - << inst->output_memory().get_allocation_type() << std::endl; + << inst->output_memory().get_allocation_type() << ", in_usm(" + << in_addr.str() << "), out_usm(" + << out_mem.buffer_ptr() << ")" << std::endl; } // If a node has mutable input or it's an output, then the input/output buffers might be changed diff --git a/src/plugins/intel_gpu/src/graph/program_helpers.cpp b/src/plugins/intel_gpu/src/graph/program_helpers.cpp index 8cfa0ca5e39..be3b3242026 100644 --- a/src/plugins/intel_gpu/src/graph/program_helpers.cpp +++ b/src/plugins/intel_gpu/src/graph/program_helpers.cpp @@ -7,9 +7,11 @@ #include "program_helpers.h" #include "intel_gpu/graph/program.hpp" #include "data_inst.h" +#include "pooling_inst.h" #include #include #include +#include namespace cldnn { // helper function for merging the weights/biases buffers on cpu side for depthwise separable convolution optimization @@ -181,28 +183,52 @@ std::pair program_helpers::are_layouts_identical(layout const& l1, l return {false, false}; } -// check if input and output layouts are identical to reuse memory in fused_ops of onednn -bool program_helpers::are_layouts_identical_for_onednn_sum_post_op(layout input_layout, layout output_layout) { - if (input_layout.size == output_layout.size && input_layout.format == output_layout.format && - input_layout.data_padding == output_layout.data_padding && - data_type_traits::size_of(input_layout.data_type) == data_type_traits::size_of(output_layout.data_type)) - return true; - - return false; -} - -bool program_helpers::needs_onednn_sum_post_op(const eltwise_node& n, layout input_layout) { - auto output_layout = n.get_output_layout(); - if (n.get_primitive()->mode == eltwise_mode::sum && - (input_layout.size.spatial[0] > 1 || input_layout.size.spatial[1] > 1 || input_layout.size.batch[0] > 1) - && output_layout.data_type == input_layout.data_type) { +bool onednn_add_fusing_helpers::is_full_tensor(const layout& l) { + if (l.size.spatial[0] > 1 || l.size.spatial[1] > 1 || (l.get_spatial_rank() == 3 && l.size.spatial[2] > 1) + || l.size.batch[0] > 1) { return true; } - return false; } +void onednn_add_fusing_helpers::for_eltwise( + const program_node& node, eltwise_mode mode, + std::function func) { + for (auto& fo : node.get_fused_primitives()) { + if (fo.node->is_type() && fo.node->as().get_primitive()->mode == mode) { + func(node, fo.node->as(), fo); + } + } +} +add_fusing_type onednn_add_fusing_helpers::get_add_fusing_type( + const program_node& p_node, const fused_primitive_desc& desc) { + if (!desc.node->is_type() || desc.node->as().get_primitive()->mode != eltwise_mode::sum) { + return add_fusing_type::not_supported; + } + + auto& eltw_node = desc.node->as(); + auto& dep_node = p_node.get_dependency(desc.dep_start_idx); + + auto p_layout = p_node.get_output_layout(); + auto e_layout = eltw_node.get_output_layout(); + auto d_layout = dep_node.get_output_layout(); + + if (is_full_tensor(p_layout) && is_full_tensor(d_layout)) { + if (data_type_traits::size_of(p_layout.data_type) == data_type_traits::size_of(d_layout.data_type) + && p_layout.format == d_layout.format && p_layout.size == d_layout.size + && p_layout.data_padding == d_layout.data_padding + && dep_node.get_users().size() == 1 + && !p_node.is_type()) { + return add_fusing_type::sum; + } else if (p_layout.size == d_layout.size) { + return add_fusing_type::binary_per_tensor; + } + } + + return add_fusing_type::binary_per_oc; +} } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index 71a8beb99c7..fd818bcd472 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -7,6 +7,7 @@ #include "primitive_inst.h" #include "loop_inst.h" #ifdef ENABLE_ONEDNN_FOR_GPU +#include "intel_gpu/runtime/debug_configuration.hpp" #include "convolution_inst.h" #include "quantize_inst.h" #include "reorder_inst.h" @@ -345,6 +346,8 @@ bool program_node::has_out_scales(const std::shared_ptr& a dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const std::shared_ptr& attr, bool& optimization_is_completed) { + GPU_DEBUG_GET_INSTANCE(debug_config); + // Create new dnnl::post_ops object which will be filled inside the optimization process dnnl::post_ops optimized_p_ops; @@ -393,6 +396,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const float scale; dnnl::memory::data_type data_type; cur_p_ops.get_params_sum(idx, scale, data_type); + // Only conv supports data type specification in append_sum. Other primitives(deconv, fc) do not support it. if (is_type()) { new_p_ops.append_sum(scale, data_type); } else { @@ -419,7 +423,8 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const // Check that post-op type is any optimized auto type_is_any_optimized = [](onednn_post_op_type type) -> bool { - return type == onednn_post_op_type::optimized || type == onednn_post_op_type::optimized_sum || + return type == onednn_post_op_type::optimized || + type == onednn_post_op_type::optimized_sum || type == onednn_post_op_type::optimized_eltwise_act || type == onednn_post_op_type::optimized_eltwise_linear || type == onednn_post_op_type::optimized_eltwise_clip || @@ -462,20 +467,45 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const } }; + auto remove_optimized_prefix = [&](std::vector& post_ops) { + // Check and update post-op map if we already optimized something + auto iter = post_ops.begin(); + while (iter != post_ops.end()) { + if (type_is_optimized_sum(iter->op_type)) { + iter->op_type = onednn_post_op_type::sum; + ++iter; + } else if (type_is_optimized_eltwise(iter->op_type)) { + iter->op_type = get_eltwise_type(iter->op_type); + ++iter; + } else if (type_is_optimized(iter->op_type)) { + iter = post_ops.erase(iter); + } else { + ++iter; + } + } + }; + auto& cur_post_ops = get_fused_primitives_onednn(); size_t cur_post_op_idx = 1; size_t prev_post_op_idx = 0; bool optimization_done = false; - // Check and update post-op map if we already optimized something - for (size_t post_op_idx = 0; post_op_idx < cur_post_ops.size(); post_op_idx++) { - if (type_is_optimized_sum(cur_post_ops[post_op_idx].op_type)) - cur_post_ops[post_op_idx].op_type = onednn_post_op_type::sum; - else if (type_is_optimized_eltwise(cur_post_ops[post_op_idx].op_type)) - cur_post_ops[post_op_idx].op_type = get_eltwise_type(cur_post_ops[post_op_idx].op_type); - else if (type_is_optimized(cur_post_ops[post_op_idx].op_type)) - cur_post_ops.erase(cur_post_ops.begin() + post_op_idx); + GPU_DEBUG_IF(debug_config->verbose >= 3) { + GPU_DEBUG_COUT << "================================================" << std::endl; + GPU_DEBUG_COUT << " " << id() << ", num of post_ops " << p_ops.len() << std::endl; + for (size_t i = 0; i < cur_post_ops.size(); i++) + GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl; + } + + remove_optimized_prefix(cur_post_ops); + + GPU_DEBUG_IF(debug_config->verbose >= 3) { + GPU_DEBUG_COUT << "remove optimized prefix ------------------------" << std::endl; + GPU_DEBUG_COUT << " " << id() << ", num of post_ops " << p_ops.len() << std::endl; + for (size_t i = 0; i < cur_post_ops.size(); i++) + GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl; + GPU_DEBUG_COUT << "----------------------------------->>>>>>>>>>>>>" << std::endl; } // Get post-ops size for current node @@ -498,6 +528,9 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const auto cur_type = cur_post_ops[cur_post_op_idx].op_type; auto prev_type = cur_post_ops[prev_post_op_idx].op_type; + GPU_DEBUG_IF(debug_config->verbose >= 3) + GPU_DEBUG_COUT << "before prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl; + // Ignore optimized operations for "previous" operation in our operation pair while (type_is_any_optimized(prev_type) && prev_post_op_idx < post_ops_size - 1) { prev_post_op_idx++; @@ -513,9 +546,18 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const cur_type = cur_post_ops[cur_post_op_idx].op_type; } + GPU_DEBUG_IF(debug_config->verbose >= 3) + GPU_DEBUG_COUT << "after prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl; + auto cur_idx = static_cast(has_out_scales(attr) ? (cur_post_op_idx >= 1 ? cur_post_op_idx - 1 : 0) : cur_post_op_idx); auto prev_idx = static_cast(has_out_scales(attr) ? (prev_post_op_idx >= 1 ? prev_post_op_idx - 1 : 0) : prev_post_op_idx); + // if 2 indices are same, add the last post-op to dnnl::post_ops + if (prev_idx == post_ops_size - 1 && prev_idx == cur_idx && !type_is_any_optimized(prev_type)) { + add_post_op(prev_type, p_ops, optimized_p_ops, prev_idx); + break; + } + // If this is the last pair and it's optimized - add the last post-op and go out from the cycle if (cur_post_op_idx == post_ops_size - 1 && (type_is_any_optimized(cur_type) || type_is_any_optimized(prev_type))) { if (!type_is_any_optimized(prev_type)) { @@ -542,6 +584,11 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const bool cur_ops_pair_is_optimized = false; + GPU_DEBUG_IF(debug_config->verbose >= 3) { + GPU_DEBUG_COUT << "prev_idx: " << prev_idx << " " << prev_type + << ", cur_idx: " << cur_idx << " " << cur_type << std::endl; + } + if (can_try_optimize) { if (eltw_and_eltw) { dnnl::algorithm cur_alg, prev_alg; @@ -701,6 +748,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const dnnl::post_ops eltw_p_op_prev, sum_p_op; eltw_p_op_prev.append_eltwise(eltw_scale * next_alpha * next_scale, alg, alpha, beta); + // Only conv supports data type specification in append_sum. Other primitives(deconv, fc) do not support it. if (is_type()) { sum_p_op.append_sum(sum_scale * next_alpha, data_type); } else { @@ -769,7 +817,18 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const } } + // if optimization_is_completed is true, try to optimize again. optimization_is_completed = !optimization_is_completed; + if (optimization_is_completed) { + remove_optimized_prefix(cur_post_ops); + } + + GPU_DEBUG_IF(debug_config->verbose >= 3) { + GPU_DEBUG_COUT << ">>>>>>>>>>>>>-----------------------------------" << std::endl; + for (size_t i = 0; i < cur_post_ops.size(); i++) + GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl; + GPU_DEBUG_COUT << "------------------------------------------------" << std::endl; + } add_onednn_fused_primitives(cur_post_ops); @@ -805,6 +864,7 @@ void program_node::init_onednn_primitive_attributes() { memory_offset++; }; + int32_t num_sum_post_ops = 0; for (size_t idx = 0; idx < cldnn_post_ops.size(); idx++) { auto node = cldnn_post_ops[idx].node; @@ -834,13 +894,11 @@ void program_node::init_onednn_primitive_attributes() { auto in = get_dependency(dep_idx).get_output_layout(); if (e_node.get_primitive()->mode == eltwise_mode::sum) { - if (program_helpers::needs_onednn_sum_post_op(e_node, in)) { - if (is_type()) { - post_ops.append_sum(1.0f, onednn::convert_data_type(in.data_type)); - } else { - post_ops.append_sum(1.0f); - } + auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*this, cldnn_post_ops[idx]); + if (fusing_type == add_fusing_type::sum && num_sum_post_ops == 0) { + post_ops.append_sum(1.0f); update_onednn_post_op_list(onednn_post_op_type::sum, dep_idx); + num_sum_post_ops++; } else { dnnl::memory::desc in_desc = onednn::layout_to_memory_desc(in); post_ops.append_binary(dnnl::algorithm::binary_add, in_desc); diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp index 8f170aa6bfa..ecbe9257bc4 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp @@ -107,6 +107,7 @@ struct gpu_usm : public lockable_gpu_mem, public memory { void unlock(const stream& stream) override; const cl::UsmMemory& get_buffer() const { return _buffer; } cl::UsmMemory& get_buffer() { return _buffer; } + void* buffer_ptr() const override { return _buffer.get(); } event::ptr fill(stream& stream, unsigned char pattern) override; event::ptr fill(stream& stream) override; diff --git a/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp index e427323590b..fc4ec82297b 100644 --- a/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp @@ -3410,68 +3410,130 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_input_range, ::test convolution_test_params{ CASE_CONV_S8S8_15, 2, 3 }, })); +struct convolution_eltw_sum_test_params { + tensor in_shape; + tensor out_shape; + tensor kernel; + ov::Strides stride; + ov::CoordinateDiff pad; + ov::Strides dilation; + uint32_t groups; + data_types data_type; + format input_format; + data_types weights_type; + format weights_format; + data_types eltw_type; + format eltw_format; + data_types out_type; + format out_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; -// input:b_fs_yx_fsv32:u8 X weight:bfyx:i8 + eltwise_sum:b_fs_yx_fsv32:u8 -// After optimization: eltwise_any + binary_add -// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_tanh+binary_add:u8:14:aBcd32b+eltwise_linear:1 -class post_ops_optimizations_onednn_binary_add_full_tensor : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(post_ops_optimizations_onednn_binary_add_full_tensor, basic) { +class EltwiseSumFusingTestOneDNN : public BaseFusingTest { +public: + void execute(convolution_eltw_sum_test_params& p) { + auto input_prim = p.data_type == data_types::u8 ? get_mem(get_input_layout(p), 0, 10) : get_mem(get_input_layout(p)); + + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + + auto pi_fused = network_fused.get_primitives_info(); + auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), [](primitive_info& p) -> bool { + if (p.original_id == "conv_prim") + return true; + return false; + }); + + if (info_fused != pi_fused.end() && engine.get_device_info().supports_immad) { + std::cout << "kernel: " << info_fused->kernel_id << std::endl; + EXPECT_TRUE(info_fused->kernel_id.find("jit:ir") != std::string::npos); + } + } + + layout get_input_layout(convolution_eltw_sum_test_params& p) { + auto pad = p.pad; + std::vector pad_ = { 0, 0, static_cast(pad[0]), static_cast(pad[1]) }; + return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } }; + } + + layout get_per_channel_layout(convolution_eltw_sum_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } }; + } +}; + +class onednn_binary_add_full_tensor : public EltwiseSumFusingTestOneDNN {}; +TEST_P(onednn_binary_add_full_tensor, basic) { auto p = GetParam(); + create_topologies( input_layout("input", get_input_layout(p)), data("weights", get_mem(get_weights_layout(p))), data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_single_element_layout(p), 0)), - data("in_hi", get_mem(get_single_element_layout(p), 255)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - data("eltwise_data", get_mem(get_output_layout(p), 0, 255)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + data("in_lo1", get_mem(get_single_element_layout(p), 0)), + data("in_hi1", get_mem(get_single_element_layout(p), 100)), + data("out_lo1", get_mem(get_single_element_layout(p), 0)), + data("out_hi1", get_mem(get_single_element_layout(p), 100)), + data("eltwise_data", get_mem(layout{ p.eltw_type, p.eltw_format, p.out_shape }, 0, 100)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation, false), activation("activation", "conv_prim", activation_func::hyperbolic_tan), - eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum), - quantize("quantize", "sum", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + quantize("quantize1", "activation", "in_lo1", "in_hi1", "out_lo1", "out_hi1", 256, p.out_type), + eltwise("sum", { "quantize1", "eltwise_data" }, eltwise_mode::sum, p.out_type), + reorder("reorder_bfyx", "sum", p.default_format, p.default_type) ); tolerance = 1.f; execute(p); } -// in_shape; out_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format; -#define CASE_CONV_U8S8_FT_BINARY_ADD_1 { 1, 32, 4, 4 }, { 1, 16, 4, 4 }, tensor{ 1, 1, 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, 1, data_types::u8, format::b_fs_yx_fsv32, data_types::i8, format::bfyx, data_types::f32, format::bfyx +// in_shape; out_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; eltw_type; eltw_format; out_type; out_format; default_type; default_format; +#define CASE_CONV_ELTW_SUM_BINARY_ADD_1 { 1, 32, 4, 4 }, { 1, 16, 4, 4 }, { 1, 1, 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, 1, data_types::u8, format::b_fs_yx_fsv32, data_types::i8, format::bfyx, data_types::u8, format::b_fs_yx_fsv32, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_CONV_ELTW_SUM_SUM_1 { 1, 32, 4, 4 }, { 1, 16, 4, 4 }, { 1, 1, 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, 1, data_types::u8, format::b_fs_yx_fsv32, data_types::i8, format::bfyx, data_types::u8, format::b_fs_yx_fsv32, data_types::u8, format::b_fs_yx_fsv32, data_types::f32, format::bfyx -INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_add_full_tensor, ::testing::ValuesIn(std::vector{ - // cases with batch = 1 - convolution_test_params{ CASE_CONV_U8S8_FT_BINARY_ADD_1, 2, 5 }, +INSTANTIATE_TEST_SUITE_P(eltwise_sum_fusings_gpu, onednn_binary_add_full_tensor, ::testing::ValuesIn(std::vector{ + convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_BINARY_ADD_1, 2, 5 }, + convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_SUM_1, 2, 5 }, })); -// input:b_fs_yx_fsv16:f16 X weight:bfyx:f16 + eltwise_sum:b_fs_yx_fsv16:f16 -// After optimization: eltwise_any + sum -// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_tanh+sum:1:0:f16 -class post_ops_optimizations_onednn_sum_full_tensor : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(post_ops_optimizations_onednn_sum_full_tensor, basic) { +class onednn_multiple_binary_add_full_tensor : public EltwiseSumFusingTestOneDNN {}; +TEST_P(onednn_multiple_binary_add_full_tensor, basic) { auto p = GetParam(); + create_topologies( input_layout("input", get_input_layout(p)), data("weights", get_mem(get_weights_layout(p))), data("bias", get_mem(get_bias_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + data("in_lo1", get_mem(get_single_element_layout(p), 0)), + data("in_hi1", get_mem(get_single_element_layout(p), 100)), + data("out_lo1", get_mem(get_single_element_layout(p), 0)), + data("out_hi1", get_mem(get_single_element_layout(p), 100)), + data("eltwise_data", get_mem(layout{ p.eltw_type, p.eltw_format, p.out_shape }, 0, 100)), + data("eltwise_data1", get_mem(layout{ p.eltw_type, p.eltw_format, p.out_shape }, 0, 100)), + data("eltwise_data2", get_mem(layout{ p.eltw_type, format::bfyx, tensor{ 1, p.out_shape.feature[0], 1, 1 } }, 0, 100)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation, false), activation("activation", "conv_prim", activation_func::hyperbolic_tan), - data("eltwise_data", get_mem(get_output_layout(p), 0, 255)), - eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum), - reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) + quantize("quantize1", "activation", "in_lo1", "in_hi1", "out_lo1", "out_hi1", 256, p.out_type), + eltwise("sum", { "quantize1", "eltwise_data" }, eltwise_mode::sum, p.out_type), // eltwise sum with full tensor + eltwise("sum1", { "sum", "eltwise_data1" }, eltwise_mode::sum, p.out_type), // eltwise sum with full tensor + eltwise("sum2", { "sum1", "eltwise_data2" }, eltwise_mode::sum, p.out_type), // eltwise sum with broadcasting + reorder("reorder_bfyx", "sum2", p.default_format, p.default_type) ); tolerance = 1.f; execute(p); } -#define CASE_CONV_F16F16_FT_ELTW_SUM_1 { 1, 32, 4, 4 }, { 1, 16, 4, 4 }, tensor{ 1, 1, 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, 1, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::bfyx, data_types::f32, format::bfyx - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_sum_full_tensor, ::testing::ValuesIn(std::vector{ - // cases with batch = 1 - convolution_test_params{ CASE_CONV_F16F16_FT_ELTW_SUM_1, 2, 4 }, +INSTANTIATE_TEST_SUITE_P(multiple_eltwise_sum_fusings_gpu, onednn_multiple_binary_add_full_tensor, ::testing::ValuesIn(std::vector{ + convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_BINARY_ADD_1, 2, 7 }, + convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_SUM_1, 2, 7 }, })); + #endif // ENABLE_ONEDNN_FOR_GPU diff --git a/src/plugins/intel_gpu/tests/fusions/fusion_test_common.hpp b/src/plugins/intel_gpu/tests/fusions/fusion_test_common.hpp index 96a3dad9d1a..8ef703bdc14 100644 --- a/src/plugins/intel_gpu/tests/fusions/fusion_test_common.hpp +++ b/src/plugins/intel_gpu/tests/fusions/fusion_test_common.hpp @@ -227,4 +227,10 @@ public: topology_fused.add(args...); topology_non_fused.add(args...); } + + template + void add_topologies(Args const&... args) { + topology_fused.add(args...); + topology_non_fused.add(args...); + } };