diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp
index 9206820288a..c02c3b8ddf2 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp
@@ -31,7 +31,6 @@ void handle_reshape::run(program& p) {
             auto output_lay = node.get_output_layout();
 
             if (!node.is_in_place() ||
-                !node.get_fused_activations_funcs().empty() ||
                 node.has_fused_primitives())
                 return;
 
@@ -55,7 +54,7 @@ void handle_reshape::run(program& p) {
     while (node_itr != p.get_processing_order().end()) {
         auto& node = (*node_itr++);
         program_helpers::do_for_types<reshape>(*node, [&p](reshape_node& node) {
-            if (node.is_output() || node.get_users().size() > 1 || !node.get_fused_activations_funcs().empty())
+            if (node.is_output() || node.get_users().size() > 1 || node.has_fused_primitives())
                 return;
 
             auto& out_node = node.get_users().front();
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/pre_replace_deconv.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/pre_replace_deconv.cpp
index 40cdf97fa4e..2fa294fe8be 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/pre_replace_deconv.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/pre_replace_deconv.cpp
@@ -306,14 +306,20 @@ void pre_replace_deconv::run(program& p) {
             auto pixel_shuffle_prim = std::make_shared<depth_to_space>(deconv_node_id, deconv_id_conv, 2, depth_to_space_mode::blocks_first);
 
             program_node& pixel_shuffle_node = p.get_or_create(pixel_shuffle_prim);
-            pixel_shuffle_node.add_fused_activation(activation_func::linear, { 1, bias });
+            auto bias_id = deconv_node_id + "_bias";
+            auto bias_prim = std::make_shared<activation>(bias_id,
+                                                          input_info(deconv_node_id),
+                                                          activation_func::linear,
+                                                          activation_additional_params{ 1, bias });
+            program_node& bias_node = p.get_or_create(bias_prim);
 
-            // add connections input->convolution, weights->convolution
+            // add connections input->depth_to_space, depth_to_space->bias
             p.add_connection(conv_node, pixel_shuffle_node);
+            p.add_connection(pixel_shuffle_node, bias_node);
 
             auto deconv_node_ptr = p.nodes_map.find(rename_id);
             if (deconv_node_ptr != p.nodes_map.end()) {
-                p.replace_all_usages(*deconv_node_ptr->second, pixel_shuffle_node);
+                p.replace_all_usages(*deconv_node_ptr->second, bias_node);
                 p.optimized_out.push_back(rename_id);
                 p.nodes_map.erase(rename_id);
             }
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
index a0720b09bcc..e2d2c12e1f7 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
@@ -67,9 +67,7 @@ bool concat_noop_optimization::match(concatenation_node& node) {
         return false;
     if (node.is_dynamic())
         return false;
-    return node.get_dependencies().size() == 1 &&
-           !node.has_fused_primitives() &&
-           node.get_fused_activations_funcs().empty();
+    return node.get_dependencies().size() == 1 && !node.has_fused_primitives();
 }
 
 bool concat_noop_optimization::optimize(concatenation_node& node) {
@@ -84,7 +82,7 @@ bool concat_noop_optimization::optimize(concatenation_node& node) {
 bool concat_in_place_optimization::match(concatenation_node& node) {
     if (node.is_output())
         return false;
-    if (node.has_fused_primitives() || !node.get_fused_activations_funcs().empty())
+    if (node.has_fused_primitives())
         return false;
     if (node.is_dynamic())
return false; @@ -300,7 +298,7 @@ void concat_in_place_optimization::optimize_cascade(concatenation_node& node, st } // namespace static bool can_reshape_be_optimized(const reshape_node& node) { - return node.is_in_place() && node.get_fused_activations_funcs().empty(); + return node.is_in_place() && !node.has_fused_primitives(); } // ToDo remove friendship relation from program_node @@ -322,11 +320,11 @@ void prepare_buffer_fusing::run(program& p) { // The condition below check only output layout as cases like // (dyn_shape) -> reshape -> (static_shape) -> some_static_primitive // may have invalid set_arguments call as output memory of reshape won't be available until reshape primitive is executed - if (node->is_type() && is_dynamic && is_planar && no_pad && !node->is_output() && node->get_fused_activations_funcs().empty()) { + if (node->is_type() && is_dynamic && is_planar && no_pad && !node->is_output() && !node->has_fused_primitives()) { return true; } - if (node->is_dynamic() || node->is_output() || (!node->get_fused_activations_funcs().empty())) { + if (node->is_dynamic() || node->is_output() || node->has_fused_primitives()) { return false; } return true; diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 699bd2b9b7a..9efe98c1b5c 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -58,7 +58,6 @@ void prepare_primitive_fusing::run(program& p) { fuse_sigmoid_mul_to_swish(p); fuse_bias(p); fuse_simple_primitives(p); - fuse_activations(p); optimize_fused_ops(p); } @@ -226,105 +225,6 @@ void prepare_primitive_fusing::fuse_reorders(program &p) { } } -void prepare_primitive_fusing::fuse_activations(program &p) { - std::map>> fusing_history; - bool use_onednn_impls = false; - -#ifdef ENABLE_ONEDNN_FOR_GPU - auto& engine = p.get_engine(); - if (engine.get_device_info().supports_immad && p.get_config().get_property(ov::intel_gpu::queue_type) == QueueTypes::in_order) - use_onednn_impls = true; -#endif - - auto itr = p.get_processing_order().begin(); - while (itr != p.get_processing_order().end()) { - auto node_itr = itr++; - auto& node = (*node_itr); - - program_helpers::do_for_types(*node, [&p, &fusing_history, &use_onednn_impls](activation_node& node) { - auto& input = node.input(); - auto id = node.id(); - // Restrictions: - // - inputs cannot be padded - // - primitives input cannot be output - // - no activation additional input - // - input was optimized - // - can't have fused primitives - if (node.has_padded_dependency() || input.is_output() || node.is_output() || - node.get_dependencies().size() != 1 || input.can_be_optimized() || node.is_constant() || - node.has_fused_primitives()) - return; - - if (use_onednn_impls && node.get_primitive()->activation_function == cldnn::activation_func::hyperbolic_tan) { - return; - } - - // - limit to primitives which implementations support activation fusing - if (input.get_users().size() != 1 || - // TODO: new api needs to be created to read such caps - // right now use whitelist so no new primitives will be affected in case of lack of fused activation - // support - (!input.is_type() && !input.is_type() && - !input.is_type() && !input.is_type() && !input.is_type() && - !input.is_type() && !input.is_type() && !input.is_type() && - !input.is_type() && !input.is_type() && !input.is_type() && - 
!input.is_type() && !input.is_type() && - !input.is_type() && !input.is_type() && !input.is_type() && - !input.is_type() && !input.is_type() && - !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && - !input.is_type() && - !input.is_type() && - !input.is_type() && - !input.is_type() && !input.is_type() && !input.is_type() && - !input.is_type() && !input.is_type() && - !input.is_type())) - return; - - if (input.is_type()) { - bool is_quantization = true; - for (auto& in : input.get_dependencies()) { - if (!data_type_traits::is_i8_u8(in.first->get_output_layout().data_type)) - is_quantization = false; - } - - // TODO: Add new fused ops mechanism support to eltwise kernel in order to enable fusings in case of quantization - if (is_quantization) - return; - } - - if (use_onednn_impls) { - if (input.is_type() || input.is_type()) - return; - #ifdef ENABLE_ONEDNN_FOR_GPU - // Activation should not be fused if it isn't supported in onednn - try { - onednn::convert_activation_func(node.get_primitive()->activation_function); - } catch (...) { - return; - } - #endif - } - - if (input.get_fused_primitives().empty()) { - input.add_fused_activation(node.get_primitive()->activation_function, node.get_primitive()->additional_params); - for (size_t i = 0; i < node.get_fused_activations_funcs().size(); i++) { - input.add_fused_activation(node.get_fused_activations_funcs()[i], - node.get_fused_activations_params()[i]); - } - auto outputPadding = node.get_output_layout().data_padding; - input.set_output_padding(outputPadding); - p.extract_and_remove(node); - } else { - // If node already has any fused node using new mechanism, - // we can just use the same way and handle any amount of activations - p.fuse_nodes(input, node, &fusing_history); - } - - p.add_optimized_primitive_info(id, {input.id()}); - }); - } -} - void prepare_primitive_fusing::fuse_bias(program &p) { auto itr = p.get_processing_order().begin(); while (itr != p.get_processing_order().end()) { @@ -781,11 +681,11 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { return; } - auto& input_data = activation_node.get_dependency(0); + auto& input = activation_node.get_dependency(0); if (activation_node.get_dependencies().size() >= 3) return; - if (!input_data_supports_fusings(input_data, activation_node.id()) || input_data.get_dependencies().empty()) + if (!input_data_supports_fusings(input, activation_node.id()) || input.get_dependencies().empty()) return; if (_lo.get_optimization_attributes().use_onednn_impls) { @@ -799,58 +699,86 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { #endif } - bool should_fuse = input_data.is_type(); + bool should_fuse = input.is_type(); - should_fuse |= input_data.is_type() && conv_supports_fusings(input_data.as()); + should_fuse |= input.is_type() && conv_supports_fusings(input.as()); - should_fuse |= input_data.is_type() && fc_supports_fusings(input_data.as()); + should_fuse |= input.is_type() && fc_supports_fusings(input.as()); - should_fuse |= input_data.is_type() && gemm_supports_fusings(input_data.as()); + should_fuse |= input.is_type() && gemm_supports_fusings(input.as()); - should_fuse |= input_data.is_type(); + should_fuse |= input.is_type(); - should_fuse |= input_data.is_type(); + should_fuse |= input.is_type(); - should_fuse |= input_data.is_type(); + should_fuse |= input.is_type(); - should_fuse |= input_data.is_type() && data_type_traits::is_i8_u8(input_data.get_dependency(0).get_output_layout().data_type); + should_fuse |= 
input.is_type() && data_type_traits::is_i8_u8(input.get_dependency(0).get_output_layout().data_type); - should_fuse |= input_data.is_type(); + should_fuse |= input.is_type(); - should_fuse |= input_data.is_type(); + should_fuse |= input.is_type(); - should_fuse |= input_data.is_type(); + should_fuse |= input.is_type(); - should_fuse |= input_data.is_type(); + should_fuse |= input.is_type(); - should_fuse |= input_data.is_type(); + should_fuse |= input.is_type(); - should_fuse |= input_data.is_type(); + should_fuse |= input.is_type(); - should_fuse |= input_data.is_type(); + should_fuse |= input.is_type(); - should_fuse |= input_data.is_type(); + should_fuse |= input.is_type(); - should_fuse |= input_data.is_type(); + should_fuse |= input.is_type(); - should_fuse |= input_data.is_type(); + should_fuse |= input.is_type(); - should_fuse |= input_data.is_type(); + should_fuse |= input.is_type(); - should_fuse |= input_data.is_type(); + should_fuse |= input.is_type(); - should_fuse |= input_data.is_type(); + should_fuse |= input.is_type(); - should_fuse |= input_data.is_type(); + should_fuse |= input.is_type(); - should_fuse |= input_data.is_type() && reduce_supports_fusings(input_data.as()); + should_fuse |= input.is_type() && reduce_supports_fusings(input.as()); - should_fuse |= input_data.is_type() && eltwise_supports_fusings(input_data.as()); + should_fuse |= input.is_type() && eltwise_supports_fusings(input.as()); + + bool legacy_fusion = activation_node.get_dependencies().size() == 1 && + !input.can_be_optimized() && + !activation_node.is_constant() && + !activation_node.has_fused_primitives() && + (input.is_type() || + input.is_type() || + input.is_type() || + input.is_type() || + input.is_type() || + input.is_type() || + input.is_type() || + input.is_type() || + input.is_type() || + input.is_type() || + input.is_type() || + input.is_type() || + input.is_type() || + input.is_type() || + input.is_type() || + input.is_type() || + input.is_type()); + + if (!should_fuse && legacy_fusion) { + GPU_DEBUG_LOG << activation_node.id() << " is fused by legacy conditions! 
Consider adding selected kernel with fused ops support\n"; + } + + should_fuse |= legacy_fusion; if (!should_fuse) return; - p.fuse_nodes(input_data, activation_node, &fusing_history); + p.fuse_nodes(input, activation_node, &fusing_history); }; auto fuse_quantize_f = [&](quantize_node& quantize_node) { diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index e0a0685e75d..ee0bc916a6c 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -61,7 +61,7 @@ void remove_redundant_reorders::run(program& p) { if (node.has_mean() || !node.get_primitive()->subtract_per_feature.empty()) continue; - if (!node.get_fused_activations_funcs().empty()) + if (node.has_fused_primitives()) continue; std::function has_quantize_user; @@ -149,7 +149,7 @@ void remove_redundant_reorders::run(program& p) { !r_dep_node.has_mean() && r_dep_node.get_primitive()->subtract_per_feature.empty() && !r_dep_node.is_output() && - r_dep_node.get_fused_activations_funcs().empty() && + !r_dep_node.has_fused_primitives() && !r_dep_node.get_primitive()->has_surface_input(); // for chains like @@ -165,7 +165,7 @@ void remove_redundant_reorders::run(program& p) { !r_dep_node.is_output() && !r_node.has_mean() && r_node.get_primitive()->subtract_per_feature.empty() && - r_node.get_fused_activations_funcs().empty() && + !r_node.has_fused_primitives() && !r_node.get_primitive()->has_surface_input(); if (remove_dep) { @@ -205,7 +205,7 @@ void remove_redundant_reorders::run(program& p) { r_node.has_mean() || r_node.get_users().size() > 1 || r_node.get_primitive()->subtract_per_feature.size() || - r_node.get_fused_activations_funcs().size()) + r_node.has_fused_primitives()) continue; if (!r_node.get_users().front()->is_type()) @@ -258,7 +258,7 @@ void remove_redundant_reorders::run(program& p) { if (r_node.has_mean() || !r_node.get_primitive()->subtract_per_feature.empty() || no_output_optimization || - !r_node.get_fused_activations_funcs().empty() || + r_node.has_fused_primitives() || r_node.get_primitive()->has_surface_input()) continue; @@ -335,7 +335,7 @@ void remove_redundant_reorders::run(program& p) { if (user->is_type() && user != node && !user->is_output() && - user->get_fused_activations_funcs().empty()) { + !user->has_fused_primitives()) { auto l1 = node->get_output_layout(); auto l2 = user->get_output_layout(); @@ -382,7 +382,7 @@ void remove_redundant_reorders::run(program& p) { if (node.has_mean() || !node.get_primitive()->subtract_per_feature.empty()) continue; - if (!node.get_fused_activations_funcs().empty()) + if (node.has_fused_primitives()) continue; if (input.get_users().size() != 1) @@ -530,7 +530,6 @@ void remove_redundant_reorders::run(program& p) { local_desc.f_param = node->get_fuse_params(); local_desc.dep_start_idx = input.get_fused_primitives().size(); local_desc.output_layout = output_layout; - local_desc.activation = activation_func::none; input.add_fused_primitive(local_desc); // remove reorder node @@ -561,8 +560,7 @@ void remove_redundant_reorders::run(program& p) { !r_node.get_primitive()->subtract_per_feature.empty()) continue; - if (!r_node.get_fused_activations_funcs().empty() || - !r_node.get_fused_primitives().empty()) + if (r_node.has_fused_primitives()) continue; // Remove reorder for Convolution bfyx -> fs_b_yx_fsv32 @@ -596,10 +594,10 @@ void 
remove_redundant_reorders::run(program& p) {
         auto& reshape_input_node = dep_node.as<reshape>();
         bool remove_dep = reshape_input_node.get_users().size() == 1 && !reshape_input_node.is_output() &&
-                          reshape_input_node.get_fused_activations_funcs().empty() && reshape_input_node.get_fused_primitives().empty();
+                          !reshape_input_node.has_fused_primitives();
 
         bool remove_current = remove_dep && !reshape_input_node.get_dependencies().empty() &&
                               reshape_input_node.get_dependency(0).get_output_layout() == reshape_node.get_output_layout() &&
-                              reshape_node.get_fused_activations_funcs().empty() && reshape_node.get_fused_primitives().empty();
+                              !reshape_node.has_fused_primitives();
 
         if (remove_dep) {
             LOG_NODE_REMOVAL(reshape_input_node.id());
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp
index 6c10b1be56b..61c4589a7e1 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp
@@ -10,6 +10,14 @@
 #include "kernel_selector_helper.h"
 #include "primitive_base.hpp"
 
+namespace {
+inline void convert_new_activation_func(const activation& prim, std::vector<kernel_selector::base_activation_params>& params) {
+    params.insert(params.begin(), {get_kernel_selector_activation_param(prim.activation_function),
+                                   prim.additional_params.a,
+                                   prim.additional_params.b});
+}
+}  // namespace
+
 namespace cldnn {
 namespace ocl {
 
@@ -40,7 +48,7 @@ struct activation_impl : typed_primitive_impl_ocl<activation> {
         auto params = get_default_params<kernel_selector::activation_params>(impl_param);
         auto optional_params = get_default_optional_params<kernel_selector::activation_optional_params>(impl_param.get_program());
 
-        convert_new_activation_func(primitive, params.activations);
+        convert_new_activation_func(*primitive, params.activations);
 
         bool is_parameterized = !primitive->additional_params_input.empty();
         if (is_parameterized) {
diff --git a/src/plugins/intel_gpu/src/graph/include/fused_primitive_desc.h b/src/plugins/intel_gpu/src/graph/include/fused_primitive_desc.h
index 5262ec9df45..f3a22dd83e1 100644
--- a/src/plugins/intel_gpu/src/graph/include/fused_primitive_desc.h
+++ b/src/plugins/intel_gpu/src/graph/include/fused_primitive_desc.h
@@ -42,9 +42,6 @@ struct fused_primitive_desc {
     std::map<primitive_id, size_t> fused_deps;
     size_t dep_start_idx;
     size_t total_num_deps = 0;
-
-    activation_func activation;
-    activation_additional_params activation_params = { 0.f, 0.f };
 };
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
diff --git a/src/plugins/intel_gpu/src/graph/include/kernel_selector_helper.h b/src/plugins/intel_gpu/src/graph/include/kernel_selector_helper.h
index 691f322e148..500937d9aac 100644
--- a/src/plugins/intel_gpu/src/graph/include/kernel_selector_helper.h
+++ b/src/plugins/intel_gpu/src/graph/include/kernel_selector_helper.h
@@ -119,8 +119,6 @@ struct kernel_impl_params {
 #ifdef ENABLE_ONEDNN_FOR_GPU
     std::vector<fused_primitive_desc_onednn> fused_desc_onednn;
 #endif // ENABLE_ONEDNN_FOR_GPU
-    std::vector<activation_func> fused_act_funcs;
-    std::vector<activation_additional_params> activation_params;
 
     optional_layout weights_layout = optional_layout();
 
@@ -141,9 +139,7 @@ struct kernel_impl_params {
                        size_t _uid,
                        const std::vector<layout>& _in_layouts,
                        const std::vector<layout>& _out_layouts,
-                       const std::vector<fused_primitive_desc>& _fused_descs,
-                       const std::vector<activation_func>& _fused_act_funcs,
-                       const std::vector<activation_additional_params>& _act_params)
+                       const std::vector<fused_primitive_desc>& _fused_descs)
                        : has_runtime_layouts(true)
                        , prog(&_prog)
                        , desc(_desc)
                        , unique_id(_uid)
                        , input_layouts(_in_layouts)
                        , output_layouts(_out_layouts)
                        , fused_desc(_fused_descs)
-                       , fused_act_funcs(_fused_act_funcs)
-                       , activation_params(_act_params)
                        , primary_input_idx(0) { }
@@ -208,31 +202,8
@@ kernel_selector::dim_tensor convert_dim_vector(const tensor& t) { static_cast(sizes[5])}; } -template -inline void convert_activation_func_params(const p_type primitive, std::vector& params) { - const float negative_slope = primitive->activation_negative_slope; - if (negative_slope != 0.0f) { - params.emplace_back(kernel_selector::activation_function::RELU_NEGATIVE_SLOPE, negative_slope, 0.0f); - } else { - params.emplace_back(kernel_selector::activation_function::RELU, 0.0f, 0.0f); - } -} - -inline void convert_fused_activation_func_params(const kernel_impl_params& param_info, std::vector& params) { - const auto& act_funcs = param_info.fused_act_funcs; - const auto& act_params = param_info.activation_params; - for (size_t i = 0; i < act_funcs.size(); i++) { - params.emplace_back(get_kernel_selector_activation_param(act_funcs[i]), - act_params[i].a, - act_params[i].b); - } -} -template -inline void convert_new_activation_func(const p_type primitive, std::vector& params) { - params.insert(params.begin(), {get_kernel_selector_activation_param(primitive->activation_function), - primitive->additional_params.a, - primitive->additional_params.b}); -} +void convert_fused_ops_to_legacy_activations(const kernel_impl_params& param_info, std::vector& activations); +bool use_legacy_fused_ops(const kernel_impl_params& param_info); void set_params(const kernel_impl_params& param_info, kernel_selector::params& params); @@ -249,56 +220,62 @@ inline params_t get_default_params(const kernel_impl_params& param_info) { params.outputs[0] = convert_data_tensor(output_layout); params.layerID = param_info.desc->id; - convert_fused_activation_func_params(param_info, params.activations); - std::map> prim_id_type_map; - size_t op_id = 0; - for (auto& fused_prim : param_info.fused_desc) { - kernel_selector::fused_operation_desc desc; - desc.op_params = std::move(fused_prim.f_param); + if (use_legacy_fused_ops(param_info)) { + // Single activation is converted to legacy fused ops format to keep good performance + // TODO: Remove it once all kernels supports new fused ops mechanism + convert_fused_ops_to_legacy_activations(param_info, params.activations); + } else { + std::map> prim_id_type_map; + size_t op_id = 0; + for (auto& fused_prim : param_info.fused_desc) { + kernel_selector::fused_operation_desc desc; + desc.op_params = std::move(fused_prim.f_param); - if (!desc.op_params) { - CLDNN_ERROR_MESSAGE(param_info.desc->id, "Invalid fused operation (" + param_info.desc->id + ") of type " + - param_info.desc->type_string()); - } + if (!desc.op_params) { + CLDNN_ERROR_MESSAGE(param_info.desc->id, "Invalid fused operation (" + param_info.desc->id + ") of type " + + param_info.desc->type_string()); + } - desc.dep_idx_start = fused_prim.dep_start_idx; - desc.dep_size = fused_prim.deps.size(); - desc.op_id = op_id++; - desc.output_tensor = convert_data_tensor(fused_prim.output_layout); - prim_id_type_map[fused_prim.desc->id] = std::make_pair(desc.op_id, desc.output_tensor.GetDType()); + desc.dep_idx_start = fused_prim.dep_start_idx; + desc.dep_size = fused_prim.deps.size(); + desc.op_id = op_id++; + desc.output_tensor = convert_data_tensor(fused_prim.output_layout); + prim_id_type_map[fused_prim.desc->id] = std::make_pair(desc.op_id, desc.output_tensor.GetDType()); - for (size_t i = desc.dep_idx_start; i < desc.dep_idx_start + desc.dep_size; i++) { - desc.tensors.push_back(convert_data_tensor(param_info.get_input_layout(i))); - } + for (size_t i = desc.dep_idx_start; i < desc.dep_idx_start + desc.dep_size; i++) { + 
desc.tensors.push_back(convert_data_tensor(param_info.get_input_layout(i))); + } - if (fused_prim.total_num_deps > 0) { - desc.dep_data.resize(fused_prim.total_num_deps); - for (auto& dep : fused_prim.fused_deps) { - auto iter = prim_id_type_map.find(dep.first); - if (iter != prim_id_type_map.end()) { - auto& op_data = iter->second; - desc.dep_data[dep.second].dep_type = kernel_selector::DepType::INTERNAL; - desc.dep_data[dep.second].op_id = op_data.first; - desc.dep_data[dep.second].data_type = op_data.second; - } - } - - int idx = 0; - for (auto& dep : fused_prim.deps) { - desc.dep_data[dep.second].dep_type = kernel_selector::DepType::EXTERNAL; - desc.dep_data[dep.second].op_id = idx; - desc.dep_data[dep.second].data_type = desc.tensors[idx++].GetDType(); - } - - for (auto& dep : desc.dep_data) { - if (dep.dep_type == kernel_selector::DepType::UNDEFINED) { - dep.dep_type = kernel_selector::DepType::ORIGINAL; - break; + if (fused_prim.total_num_deps > 0) { + desc.dep_data.resize(fused_prim.total_num_deps); + for (auto& dep : fused_prim.fused_deps) { + auto iter = prim_id_type_map.find(dep.first); + if (iter != prim_id_type_map.end()) { + auto& op_data = iter->second; + desc.dep_data[dep.second].dep_type = kernel_selector::DepType::INTERNAL; + desc.dep_data[dep.second].op_id = op_data.first; + desc.dep_data[dep.second].data_type = op_data.second; + } + } + + int idx = 0; + for (auto& dep : fused_prim.deps) { + desc.dep_data[dep.second].dep_type = kernel_selector::DepType::EXTERNAL; + desc.dep_data[dep.second].op_id = idx; + desc.dep_data[dep.second].data_type = desc.tensors[idx++].GetDType(); + } + + for (auto& dep : desc.dep_data) { + if (dep.dep_type == kernel_selector::DepType::UNDEFINED) { + dep.dep_type = kernel_selector::DepType::ORIGINAL; + break; + } } } + params.fused_ops.push_back(desc); } - params.fused_ops.push_back(desc); } + return params; } diff --git a/src/plugins/intel_gpu/src/graph/include/pass_manager.h b/src/plugins/intel_gpu/src/graph/include/pass_manager.h index b6797372bfd..398a468f5f1 100644 --- a/src/plugins/intel_gpu/src/graph/include/pass_manager.h +++ b/src/plugins/intel_gpu/src/graph/include/pass_manager.h @@ -198,7 +198,6 @@ private: void fuse_sigmoid_mul_to_swish(program &p); void fuse_bias(program &p); void fuse_reorders(program& p); - void fuse_activations(program& p); void fuse_simple_primitives(program &p); void optimize_fused_ops(program &p); void remove_redundant_reshape(program &p); diff --git a/src/plugins/intel_gpu/src/graph/include/program_node.h b/src/plugins/intel_gpu/src/graph/include/program_node.h index 35f869ec73d..00a67dd0c4c 100644 --- a/src/plugins/intel_gpu/src/graph/include/program_node.h +++ b/src/plugins/intel_gpu/src/graph/include/program_node.h @@ -91,8 +91,7 @@ public: virtual std::unique_ptr get_kernel_impl_params(const std::vector& in_layouts, const std::vector& out_layouts) const { auto params = std::unique_ptr(new kernel_impl_params(get_program(), get_primitive(), get_unique_id(), in_layouts, out_layouts, - get_fused_primitives(), - get_fused_activations_funcs(), get_fused_activations_params())); + get_fused_primitives())); params->memory_deps = get_const_memory_deps(); auto deps = get_dependencies(); @@ -245,33 +244,6 @@ public: void unmark() { user_mark = 0; } bool is_marked() const { return user_mark != 0; } - void add_fused_activation(activation_func activation_func, - activation_additional_params additional_params) { - fused_activations.emplace_back(activation_func, additional_params); - } - - std::vector 
get_fused_activations_funcs() const { - std::vector funcs; - std::transform(fused_activations.begin(), - fused_activations.end(), - std::back_inserter(funcs), - [](fused_activation_params const& p) { return p.func; }); - return funcs; - } - - std::vector get_fused_activations_params() const { - std::vector params; - std::transform(fused_activations.begin(), - fused_activations.end(), - std::back_inserter(params), - [](fused_activation_params const& p) { return p.params; }); - return params; - } - - void copy_fused_activation(const program_node& rhs) { - fused_activations = rhs.fused_activations; - } - // check/set if the node can be optimized out (removed from the network) bool can_be_optimized() const { return optimized; } void can_be_optimized(bool opt) { optimized = opt; } @@ -435,18 +407,6 @@ protected: const primitive_id org_id; - struct fused_activation_params { - activation_func func = activation_func::none; - activation_additional_params params = {0.0f, 0.0f}; - - fused_activation_params() {} - - fused_activation_params(activation_func _func, activation_additional_params _params) : - func(_func), - params(_params) {} - }; - - std::vector fused_activations; std::vector fused_prims; void invalidate_users() const; diff --git a/src/plugins/intel_gpu/src/graph/include/reshape_inst.h b/src/plugins/intel_gpu/src/graph/include/reshape_inst.h index ddf1a7c02c5..beb3b743b8b 100644 --- a/src/plugins/intel_gpu/src/graph/include/reshape_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/reshape_inst.h @@ -28,7 +28,7 @@ public: } bool is_in_place() const { - if (this->is_output() || !this->get_fused_activations_funcs().empty()) + if (this->is_output() || this->has_fused_primitives()) return false; return (!this->get_output_layout().data_padding && !input().get_output_layout(false).data_padding); } diff --git a/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp index 28b42ca0589..657fd4d6586 100644 --- a/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp @@ -12,6 +12,24 @@ #include "intel_gpu/graph/serialization/string_serializer.hpp" #include "intel_gpu/graph/serialization/vector_serializer.hpp" +#include "intel_gpu/primitives/concatenation.hpp" +#include "intel_gpu/primitives/convolution.hpp" +#include "intel_gpu/primitives/crop.hpp" +#include "intel_gpu/primitives/eltwise.hpp" +#include "intel_gpu/primitives/fully_connected.hpp" +#include "intel_gpu/primitives/normalize.hpp" +#include "intel_gpu/primitives/reorder.hpp" +#include "intel_gpu/primitives/reshape.hpp" +#include "intel_gpu/primitives/roi_pooling.hpp" +#include "intel_gpu/primitives/softmax.hpp" +#include "intel_gpu/primitives/depth_to_space.hpp" +#include "intel_gpu/primitives/shuffle_channels.hpp" +#include "intel_gpu/primitives/strided_slice.hpp" +#include "intel_gpu/primitives/cum_sum.hpp" +#include "intel_gpu/primitives/reverse_sequence.hpp" +#include "intel_gpu/primitives/embedding_bag.hpp" +#include "intel_gpu/primitives/extract_image_patches.hpp" + #include #include @@ -1008,6 +1026,54 @@ kernel_selector::activation_function get_kernel_selector_activation_param(activa } } +void convert_fused_ops_to_legacy_activations(const kernel_impl_params& param_info, std::vector& activations) { + auto op_desc = param_info.fused_desc[0].typed_desc(); + auto func = op_desc->activation_function; + auto params = op_desc->additional_params; + + 
activations.push_back({get_kernel_selector_activation_param(func), params.a, params.b}); +} + +bool use_legacy_fused_ops(const kernel_impl_params& param_info) { + const auto& fused_ops = param_info.fused_desc; + if (fused_ops.size() != 1) + return false; + + const auto& fused_op = fused_ops[0]; + if (!fused_op.is_type()) + return false; + + if (!fused_op.deps.empty()) + return false; + + + std::vector legacy_fusion_list = { + concatenation::type_id(), + convolution::type_id(), + crop::type_id(), + eltwise::type_id(), + fully_connected::type_id(), + normalize::type_id(), + reorder::type_id(), + reshape::type_id(), + roi_pooling::type_id(), + softmax::type_id(), + depth_to_space::type_id(), + shuffle_channels::type_id(), + strided_slice::type_id(), + cum_sum::type_id(), + reverse_sequence::type_id(), + embedding_bag::type_id(), + extract_image_patches::type_id() + }; + + if (std::find(legacy_fusion_list.begin(), legacy_fusion_list.end(), param_info.desc->type) == legacy_fusion_list.end()) { + return false; + } + + return true; +} + void set_params(const kernel_impl_params& param_info, kernel_selector::params& params) { const auto& program = param_info.prog; const auto& device_info = program->get_engine().get_device_info(); diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 871cb5c6e63..82850b475e4 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -1159,14 +1159,6 @@ void program::fuse_nodes(program_node &fused_node, local_desc.total_num_deps = peer_node.get_dependencies().size(); local_desc.input_layout = peer_node.get_dependency(0).get_output_layout(); local_desc.output_layout = peer_layout; - local_desc.activation = activation_func::none; - if (!peer_node.get_fused_activations_funcs().empty()) { - if (peer_node.get_fused_activations_funcs().size() > 1) - CLDNN_ERROR_MESSAGE(peer_node.id(), "Fused primitive descriptor doesn't support > 1 activation functions in a peer node"); - - local_desc.activation = peer_node.get_fused_activations_funcs()[0]; - local_desc.activation_params = peer_node.get_fused_activations_params()[0]; - } auto fusedPadding = fused_node.get_output_layout().data_padding; cldnn::padding needed_padding = padding::max(peer_layout.data_padding, diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index 9736420f85d..2d6b95e53ac 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -131,20 +131,6 @@ std::unique_ptr program_node::desc_to_json() const { } node_info->add("fused primitives", fused_nodes_info); - json_composite fused_activations; - auto fused_activations_funcs = get_fused_activations_funcs(); - if (!fused_activations_funcs.empty()) { - for (size_t i = 0; i < fused_activations_funcs.size(); i++) { - json_composite fused_activation_info; - auto activation_type = activation_type_to_str(fused_activations_funcs[i]); - auto params = get_fused_activations_params()[i]; - fused_activation_info.add("params", "a=" + std::to_string(params.a) + ", b=" + std::to_string(params.b)); - fused_activation_info.add("activation", activation_type); - fused_activations.add("fused activation idx " + std::to_string(i), fused_activation_info); - } - node_info->add("fused activations (legacy)", fused_activations); - } - #ifdef ENABLE_ONEDNN_FOR_GPU auto& onednn_post_ops = get_fused_primitives_onednn(); if (onednn_post_ops.size()) { @@ -1174,27 +1160,6 
@@ void program_node::init_onednn_primitive_attributes() { } } - if (cldnn_post_ops.size() && get_fused_activations_funcs().size()) - throw std::runtime_error("Unsupported mix of fused ops and activations"); - - for (size_t i = 0; i < get_fused_activations_funcs().size(); i++) { - auto activation_type = get_fused_activations_funcs()[i]; - if (activation_type == cldnn::activation_func::hsigmoid) { - // Unsupported hsigmoid oneDNN gpu, splits hsigmoid activation min(max(val + 3, 0), 6) / 6 - post_ops.append_eltwise(dnnl::algorithm::eltwise_linear, 1.f, 3.f); - post_ops.append_eltwise(dnnl::algorithm::eltwise_clip, 0.f, 6.f); - post_ops.append_eltwise(dnnl::algorithm::eltwise_linear, 1/6.f, 0.f); - update_onednn_post_op_list(onednn_post_op_type::eltwise_linear, empty_mem); - update_onednn_post_op_list(onednn_post_op_type::eltwise_clip, empty_mem); - update_onednn_post_op_list(onednn_post_op_type::eltwise_linear, empty_mem); - } else { - auto params = get_fused_activations_params()[i]; - dnnl::algorithm alg = onednn::convert_activation_func(activation_type); - post_ops.append_eltwise(alg, params.a, params.b); - update_onednn_post_op_list(onednn_post_op_type::eltwise_act, empty_mem); - } - } - // Trying to optimize more than 1 post-ops if (fused_ops.size() > 1) { dnnl::post_ops optimized_post_ops = post_ops; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_winograd_6x3_s1_fused.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_winograd_6x3_s1_fused.cl index af043d939f4..3784ffa9c39 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_winograd_6x3_s1_fused.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_winograd_6x3_s1_fused.cl @@ -69,7 +69,7 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused) __global INPUT0_TYPE* I, __global OUTPUT_TYPE* O, #if FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_FBXYB || FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_XFBYB - __read_only image2d_t U, + __read_only image2d_t U #else __global FILTER_TYPE* U #endif