[GPU] Remove legacy activation fused ops from program (#15075)

* [GPU] Fix winograd kernel

* [GPU] Remove fusion via legacy mechanism

* [GPU] Conversion to legacy activations in kernel selector helper
Vladimir Paramuzov 2023-01-20 09:55:53 +04:00 committed by GitHub
parent 65268d32df
commit 5b389860a1
15 changed files with 209 additions and 316 deletions
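
At a high level, this change drops the dedicated per-node list of fused activations and routes every fused activation through the generic fused-primitives mechanism instead. A minimal, self-contained sketch of that representational shift, using simplified stand-in types rather than the actual cldnn headers:

#include <string>
#include <vector>

enum class activation_func { none, relu, linear };
struct activation_additional_params { float a = 0.f, b = 0.f; };

// Simplified stand-in for cldnn::fused_primitive_desc as it appears in this diff.
struct fused_primitive_desc_model {
    std::string type_string;             // e.g. "activation"
    activation_func func = activation_func::none;
    activation_additional_params params;
};

struct program_node_model {
    // Removed by this commit: a separate std::vector<fused_activation_params>
    // filled via add_fused_activation(); only the generic list remains.
    std::vector<fused_primitive_desc_model> fused_prims;
    bool has_fused_primitives() const { return !fused_prims.empty(); }
};

int main() {
    program_node_model conv;
    // Fusing a ReLU now means appending a generic fused-primitive descriptor
    // (done by program::fuse_nodes in the real code), not calling
    // add_fused_activation().
    conv.fused_prims.push_back({"activation", activation_func::relu, {}});
    return conv.has_fused_primitives() ? 0 : 1;
}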

View File

@ -31,7 +31,6 @@ void handle_reshape::run(program& p) {
auto output_lay = node.get_output_layout();
if (!node.is_in_place() ||
!node.get_fused_activations_funcs().empty() ||
node.has_fused_primitives())
return;
@ -55,7 +54,7 @@ void handle_reshape::run(program& p) {
while (node_itr != p.get_processing_order().end()) {
auto& node = (*node_itr++);
program_helpers::do_for_types<reshape>(*node, [&p](reshape_node& node) {
if (node.is_output() || node.get_users().size() > 1 || !node.get_fused_activations_funcs().empty())
if (node.is_output() || node.get_users().size() > 1 || node.has_fused_primitives())
return;
auto& out_node = node.get_users().front();

View File

@ -306,14 +306,20 @@ void pre_replace_deconv::run(program& p) {
auto pixel_shuffle_prim = std::make_shared<depth_to_space>(deconv_node_id, deconv_id_conv, 2, depth_to_space_mode::blocks_first);
program_node& pixel_shuffle_node = p.get_or_create(pixel_shuffle_prim);
pixel_shuffle_node.add_fused_activation(activation_func::linear, { 1, bias });
auto bias_id = deconv_node_id + "_bias";
auto bias_prim = std::make_shared<activation>(bias_id,
input_info(deconv_node_id),
activation_func::linear,
activation_additional_params{ 1, bias });
program_node& bias_node = p.get_or_create(bias_prim);
// add connections input->convolution, weights->convolution
// add connections input->depth_to_space, depth_to_space->bias
p.add_connection(conv_node, pixel_shuffle_node);
p.add_connection(pixel_shuffle_node, bias_node);
auto deconv_node_ptr = p.nodes_map.find(rename_id);
if (deconv_node_ptr != p.nodes_map.end()) {
p.replace_all_usages(*deconv_node_ptr->second, pixel_shuffle_node);
p.replace_all_usages(*deconv_node_ptr->second, bias_node);
p.optimized_out.push_back(rename_id);
p.nodes_map.erase(rename_id);
}
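
The removed add_fused_activation(activation_func::linear, { 1, bias }) and the new standalone activation node express the same bias add, since a linear activation computes a*x + b and the parameters here are a = 1, b = bias. A tiny self-contained check of that identity, for illustration only:

#include <cassert>

// linear activation as used above: f(x) = a * x + b, with a = 1 and b = bias.
static float linear_activation(float x, float a, float b) { return a * x + b; }

int main() {
    const float bias = 0.25f;
    for (float x : {-2.0f, 0.0f, 3.5f})
        assert(linear_activation(x, 1.0f, bias) == x + bias);
    return 0;
}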

View File

@ -67,9 +67,7 @@ bool concat_noop_optimization::match(concatenation_node& node) {
return false;
if (node.is_dynamic())
return false;
return node.get_dependencies().size() == 1 &&
!node.has_fused_primitives() &&
node.get_fused_activations_funcs().empty();
return node.get_dependencies().size() == 1 && !node.has_fused_primitives();
}
bool concat_noop_optimization::optimize(concatenation_node& node) {
@ -84,7 +82,7 @@ bool concat_noop_optimization::optimize(concatenation_node& node) {
bool concat_in_place_optimization::match(concatenation_node& node) {
if (node.is_output())
return false;
if (node.has_fused_primitives() || !node.get_fused_activations_funcs().empty())
if (node.has_fused_primitives())
return false;
if (node.is_dynamic())
return false;
@ -300,7 +298,7 @@ void concat_in_place_optimization::optimize_cascade(concatenation_node& node, st
} // namespace
static bool can_reshape_be_optimized(const reshape_node& node) {
return node.is_in_place() && node.get_fused_activations_funcs().empty();
return node.is_in_place() && !node.has_fused_primitives();
}
// ToDo remove friendship relation from program_node
@ -322,11 +320,11 @@ void prepare_buffer_fusing::run(program& p) {
// The condition below checks only the output layout because cases like
// (dyn_shape) -> reshape -> (static_shape) -> some_static_primitive
// may have an invalid set_arguments call, as the output memory of reshape won't be available until the reshape primitive is executed
if (node->is_type<reshape>() && is_dynamic && is_planar && no_pad && !node->is_output() && node->get_fused_activations_funcs().empty()) {
if (node->is_type<reshape>() && is_dynamic && is_planar && no_pad && !node->is_output() && !node->has_fused_primitives()) {
return true;
}
if (node->is_dynamic() || node->is_output() || (!node->get_fused_activations_funcs().empty())) {
if (node->is_dynamic() || node->is_output() || node->has_fused_primitives()) {
return false;
}
return true;

View File

@ -58,7 +58,6 @@ void prepare_primitive_fusing::run(program& p) {
fuse_sigmoid_mul_to_swish(p);
fuse_bias(p);
fuse_simple_primitives(p);
fuse_activations(p);
optimize_fused_ops(p);
}
@ -226,105 +225,6 @@ void prepare_primitive_fusing::fuse_reorders(program &p) {
}
}
void prepare_primitive_fusing::fuse_activations(program &p) {
std::map<primitive_id, std::vector<std::pair<primitive_id, size_t>>> fusing_history;
bool use_onednn_impls = false;
#ifdef ENABLE_ONEDNN_FOR_GPU
auto& engine = p.get_engine();
if (engine.get_device_info().supports_immad && p.get_config().get_property(ov::intel_gpu::queue_type) == QueueTypes::in_order)
use_onednn_impls = true;
#endif
auto itr = p.get_processing_order().begin();
while (itr != p.get_processing_order().end()) {
auto node_itr = itr++;
auto& node = (*node_itr);
program_helpers::do_for_types<activation>(*node, [&p, &fusing_history, &use_onednn_impls](activation_node& node) {
auto& input = node.input();
auto id = node.id();
// Restrictions:
// - inputs cannot be padded
// - primitives input cannot be output
// - no activation additional input
// - input was optimized
// - can't have fused primitives
if (node.has_padded_dependency() || input.is_output() || node.is_output() ||
node.get_dependencies().size() != 1 || input.can_be_optimized() || node.is_constant() ||
node.has_fused_primitives())
return;
if (use_onednn_impls && node.get_primitive()->activation_function == cldnn::activation_func::hyperbolic_tan) {
return;
}
// - limit to primitives which implementations support activation fusing
if (input.get_users().size() != 1 ||
// TODO: new api needs to be created to read such caps
// right now use whitelist so no new primitives will be affected in case of lack of fused activation
// support
(!input.is_type<concatenation>() && !input.is_type<convolution>() &&
!input.is_type<crop>() && !input.is_type<deconvolution>() && !input.is_type<eltwise>() &&
!input.is_type<fully_connected>() && !input.is_type<lrn>() && !input.is_type<normalize>() &&
!input.is_type<permute>() && !input.is_type<pooling>() && !input.is_type<reorder>() &&
!input.is_type<reshape>() && !input.is_type<roi_pooling>() &&
!input.is_type<softmax>() && !input.is_type<resample>() && !input.is_type<mvn>() &&
!input.is_type<depth_to_space>() && !input.is_type<batch_to_space>() &&
!input.is_type<space_to_batch>() && !input.is_type<gather>() && !input.is_type<scatter_update>() && !input.is_type<shuffle_channels>() &&
!input.is_type<scatter_nd_update>() &&
!input.is_type<gather_nd>() &&
!input.is_type<gather_elements>() &&
!input.is_type<strided_slice>() && !input.is_type<cum_sum>() && !input.is_type<reverse_sequence>() &&
!input.is_type<embedding_bag>() && !input.is_type<extract_image_patches>() &&
!input.is_type<activation>()))
return;
if (input.is_type<eltwise>()) {
bool is_quantization = true;
for (auto& in : input.get_dependencies()) {
if (!data_type_traits::is_i8_u8(in.first->get_output_layout().data_type))
is_quantization = false;
}
// TODO: Add new fused ops mechanism support to eltwise kernel in order to enable fusings in case of quantization
if (is_quantization)
return;
}
if (use_onednn_impls) {
if (input.is_type<reshape>() || input.is_type<concatenation>())
return;
#ifdef ENABLE_ONEDNN_FOR_GPU
// Activation should not be fused if it isn't supported in onednn
try {
onednn::convert_activation_func(node.get_primitive()->activation_function);
} catch (...) {
return;
}
#endif
}
if (input.get_fused_primitives().empty()) {
input.add_fused_activation(node.get_primitive()->activation_function, node.get_primitive()->additional_params);
for (size_t i = 0; i < node.get_fused_activations_funcs().size(); i++) {
input.add_fused_activation(node.get_fused_activations_funcs()[i],
node.get_fused_activations_params()[i]);
}
auto outputPadding = node.get_output_layout().data_padding;
input.set_output_padding(outputPadding);
p.extract_and_remove(node);
} else {
// If node already has any fused node using new mechanism,
// we can just use the same way and handle any amount of activations
p.fuse_nodes(input, node, &fusing_history);
}
p.add_optimized_primitive_info(id, {input.id()});
});
}
}
void prepare_primitive_fusing::fuse_bias(program &p) {
auto itr = p.get_processing_order().begin();
while (itr != p.get_processing_order().end()) {
@ -781,11 +681,11 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
return;
}
auto& input_data = activation_node.get_dependency(0);
auto& input = activation_node.get_dependency(0);
if (activation_node.get_dependencies().size() >= 3)
return;
if (!input_data_supports_fusings(input_data, activation_node.id()) || input_data.get_dependencies().empty())
if (!input_data_supports_fusings(input, activation_node.id()) || input.get_dependencies().empty())
return;
if (_lo.get_optimization_attributes().use_onednn_impls) {
@ -799,58 +699,86 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
#endif
}
bool should_fuse = input_data.is_type<binary_convolution>();
bool should_fuse = input.is_type<binary_convolution>();
should_fuse |= input_data.is_type<convolution>() && conv_supports_fusings(input_data.as<convolution>());
should_fuse |= input.is_type<convolution>() && conv_supports_fusings(input.as<convolution>());
should_fuse |= input_data.is_type<fully_connected>() && fc_supports_fusings(input_data.as<fully_connected>());
should_fuse |= input.is_type<fully_connected>() && fc_supports_fusings(input.as<fully_connected>());
should_fuse |= input_data.is_type<gemm>() && gemm_supports_fusings(input_data.as<gemm>());
should_fuse |= input.is_type<gemm>() && gemm_supports_fusings(input.as<gemm>());
should_fuse |= input_data.is_type<pooling>();
should_fuse |= input.is_type<pooling>();
should_fuse |= input_data.is_type<resample>();
should_fuse |= input.is_type<resample>();
should_fuse |= input_data.is_type<mvn>();
should_fuse |= input.is_type<mvn>();
should_fuse |= input_data.is_type<normalize>() && data_type_traits::is_i8_u8(input_data.get_dependency(0).get_output_layout().data_type);
should_fuse |= input.is_type<normalize>() && data_type_traits::is_i8_u8(input.get_dependency(0).get_output_layout().data_type);
should_fuse |= input_data.is_type<deconvolution>();
should_fuse |= input.is_type<deconvolution>();
should_fuse |= input_data.is_type<permute>();
should_fuse |= input.is_type<permute>();
should_fuse |= input_data.is_type<activation>();
should_fuse |= input.is_type<activation>();
should_fuse |= input_data.is_type<lrn>();
should_fuse |= input.is_type<lrn>();
should_fuse |= input_data.is_type<gather>();
should_fuse |= input.is_type<gather>();
should_fuse |= input_data.is_type<gather_nd>();
should_fuse |= input.is_type<gather_nd>();
should_fuse |= input_data.is_type<gather_elements>();
should_fuse |= input.is_type<gather_elements>();
should_fuse |= input_data.is_type<scatter_update>();
should_fuse |= input.is_type<scatter_update>();
should_fuse |= input_data.is_type<scatter_nd_update>();
should_fuse |= input.is_type<scatter_nd_update>();
should_fuse |= input_data.is_type<scatter_elements_update>();
should_fuse |= input.is_type<scatter_elements_update>();
should_fuse |= input_data.is_type<depth_to_space>();
should_fuse |= input.is_type<depth_to_space>();
should_fuse |= input_data.is_type<space_to_depth>();
should_fuse |= input.is_type<space_to_depth>();
should_fuse |= input_data.is_type<batch_to_space>();
should_fuse |= input.is_type<batch_to_space>();
should_fuse |= input_data.is_type<space_to_batch>();
should_fuse |= input.is_type<space_to_batch>();
should_fuse |= input_data.is_type<reduce>() && reduce_supports_fusings(input_data.as<reduce>());
should_fuse |= input.is_type<reduce>() && reduce_supports_fusings(input.as<reduce>());
should_fuse |= input_data.is_type<eltwise>() && eltwise_supports_fusings(input_data.as<eltwise>());
should_fuse |= input.is_type<eltwise>() && eltwise_supports_fusings(input.as<eltwise>());
bool legacy_fusion = activation_node.get_dependencies().size() == 1 &&
!input.can_be_optimized() &&
!activation_node.is_constant() &&
!activation_node.has_fused_primitives() &&
(input.is_type<concatenation>() ||
input.is_type<convolution>() ||
input.is_type<crop>() ||
input.is_type<eltwise>() ||
input.is_type<fully_connected>() ||
input.is_type<normalize>() ||
input.is_type<reorder>() ||
input.is_type<reshape>() ||
input.is_type<roi_pooling>() ||
input.is_type<softmax>() ||
input.is_type<depth_to_space>() ||
input.is_type<shuffle_channels>() ||
input.is_type<strided_slice>() ||
input.is_type<cum_sum>() ||
input.is_type<reverse_sequence>() ||
input.is_type<embedding_bag>() ||
input.is_type<extract_image_patches>());
if (!should_fuse && legacy_fusion) {
GPU_DEBUG_LOG << activation_node.id() << " is fused by legacy conditions! Consider adding selected kernel with fused ops support\n";
}
should_fuse |= legacy_fusion;
if (!should_fuse)
return;
p.fuse_nodes(input_data, activation_node, &fusing_history);
p.fuse_nodes(input, activation_node, &fusing_history);
};
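
With fuse_activations() gone, the only remaining path for an activation is fuse_simple_primitives(): it is fused when its producer supports the new fused-ops mechanism, or, failing that, when the producer is on the legacy whitelist above, in which case the debug message asks for fused-ops support in the selected kernel. A small self-contained model of that gating, with primitive types reduced to plain strings rather than the real node classes:

#include <iostream>
#include <set>
#include <string>

// should_fuse mirrors the chain of "should_fuse |= ..." checks; legacy_fusion
// mirrors the whitelist fallback that only logs when the modern path said no.
bool should_fuse_activation(const std::string& producer_type,
                            const std::set<std::string>& new_fused_ops_support,
                            const std::set<std::string>& legacy_whitelist) {
    bool should_fuse = new_fused_ops_support.count(producer_type) > 0;
    bool legacy_fusion = legacy_whitelist.count(producer_type) > 0;
    if (!should_fuse && legacy_fusion)
        std::cout << producer_type << " is fused by legacy conditions\n";
    return should_fuse || legacy_fusion;
}

int main() {
    const std::set<std::string> modern = {"convolution", "pooling", "eltwise"};
    const std::set<std::string> legacy = {"reorder", "reshape", "strided_slice"};
    return should_fuse_activation("reorder", modern, legacy) ? 0 : 1;
}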
auto fuse_quantize_f = [&](quantize_node& quantize_node) {

View File

@ -61,7 +61,7 @@ void remove_redundant_reorders::run(program& p) {
if (node.has_mean() || !node.get_primitive()->subtract_per_feature.empty())
continue;
if (!node.get_fused_activations_funcs().empty())
if (node.has_fused_primitives())
continue;
std::function<bool(program_node&)> has_quantize_user;
@ -149,7 +149,7 @@ void remove_redundant_reorders::run(program& p) {
!r_dep_node.has_mean() &&
r_dep_node.get_primitive()->subtract_per_feature.empty() &&
!r_dep_node.is_output() &&
r_dep_node.get_fused_activations_funcs().empty() &&
!r_dep_node.has_fused_primitives() &&
!r_dep_node.get_primitive()->has_surface_input();
// for chains like
@ -165,7 +165,7 @@ void remove_redundant_reorders::run(program& p) {
!r_dep_node.is_output() &&
!r_node.has_mean() &&
r_node.get_primitive()->subtract_per_feature.empty() &&
r_node.get_fused_activations_funcs().empty() &&
!r_node.has_fused_primitives() &&
!r_node.get_primitive()->has_surface_input();
if (remove_dep) {
@ -205,7 +205,7 @@ void remove_redundant_reorders::run(program& p) {
r_node.has_mean() ||
r_node.get_users().size() > 1 ||
r_node.get_primitive()->subtract_per_feature.size() ||
r_node.get_fused_activations_funcs().size())
r_node.has_fused_primitives())
continue;
if (!r_node.get_users().front()->is_type<concatenation>())
@ -258,7 +258,7 @@ void remove_redundant_reorders::run(program& p) {
if (r_node.has_mean() ||
!r_node.get_primitive()->subtract_per_feature.empty() ||
no_output_optimization ||
!r_node.get_fused_activations_funcs().empty() ||
r_node.has_fused_primitives() ||
r_node.get_primitive()->has_surface_input())
continue;
@ -335,7 +335,7 @@ void remove_redundant_reorders::run(program& p) {
if (user->is_type<reorder>() &&
user != node &&
!user->is_output() &&
user->get_fused_activations_funcs().empty()) {
!user->has_fused_primitives()) {
auto l1 = node->get_output_layout();
auto l2 = user->get_output_layout();
@ -382,7 +382,7 @@ void remove_redundant_reorders::run(program& p) {
if (node.has_mean() || !node.get_primitive()->subtract_per_feature.empty())
continue;
if (!node.get_fused_activations_funcs().empty())
if (node.has_fused_primitives())
continue;
if (input.get_users().size() != 1)
@ -530,7 +530,6 @@ void remove_redundant_reorders::run(program& p) {
local_desc.f_param = node->get_fuse_params();
local_desc.dep_start_idx = input.get_fused_primitives().size();
local_desc.output_layout = output_layout;
local_desc.activation = activation_func::none;
input.add_fused_primitive(local_desc);
// remove reorder node
@ -561,8 +560,7 @@ void remove_redundant_reorders::run(program& p) {
!r_node.get_primitive()->subtract_per_feature.empty())
continue;
if (!r_node.get_fused_activations_funcs().empty() ||
!r_node.get_fused_primitives().empty())
if (r_node.has_fused_primitives())
continue;
// Remove reorder for Convolution bfyx -> fs_b_yx_fsv32
@ -596,10 +594,10 @@ void remove_redundant_reorders::run(program& p) {
auto& reshape_input_node = dep_node.as<reshape>();
bool remove_dep = reshape_input_node.get_users().size() == 1 && !reshape_input_node.is_output() &&
reshape_input_node.get_fused_activations_funcs().empty() && reshape_input_node.get_fused_primitives().empty();
!reshape_input_node.has_fused_primitives();
bool remove_current = remove_dep && !reshape_input_node.get_dependencies().empty() &&
reshape_input_node.get_dependency(0).get_output_layout() == reshape_node.get_output_layout() &&
reshape_node.get_fused_activations_funcs().empty() && reshape_node.get_fused_primitives().empty();
!reshape_node.has_fused_primitives();
if (remove_dep) {
LOG_NODE_REMOVAL(reshape_input_node.id());

View File

@ -10,6 +10,14 @@
#include "kernel_selector_helper.h"
#include "primitive_base.hpp"
namespace {
inline void convert_new_activation_func(const activation& prim, std::vector<kernel_selector::base_activation_params>& params) {
params.insert(params.begin(), {get_kernel_selector_activation_param(prim.activation_function),
prim.additional_params.a,
prim.additional_params.b});
}
} // namespace
namespace cldnn {
namespace ocl {
@ -40,7 +48,7 @@ struct activation_impl : typed_primitive_impl_ocl<activation> {
auto params = get_default_params<kernel_selector::activation_params>(impl_param);
auto optional_params = get_default_optional_params<kernel_selector::activation_optional_params>(impl_param.get_program());
convert_new_activation_func(primitive, params.activations);
convert_new_activation_func(*primitive, params.activations);
bool is_parameterized = !primitive->additional_params_input.empty();
if (is_parameterized) {

View File

@ -42,9 +42,6 @@ struct fused_primitive_desc {
std::map<primitive_id, size_t> fused_deps;
size_t dep_start_idx;
size_t total_num_deps = 0;
activation_func activation;
activation_additional_params activation_params = { 0.f, 0.f };
};
#ifdef ENABLE_ONEDNN_FOR_GPU

View File

@ -119,8 +119,6 @@ struct kernel_impl_params {
#ifdef ENABLE_ONEDNN_FOR_GPU
std::vector<cldnn::fused_primitive_desc_onednn> fused_desc_onednn;
#endif // ENABLE_ONEDNN_FOR_GPU
std::vector<activation_func> fused_act_funcs;
std::vector<activation_additional_params> activation_params;
optional_layout weights_layout = optional_layout();
@ -141,9 +139,7 @@ struct kernel_impl_params {
size_t _uid,
const std::vector<layout>& _in_layouts,
const std::vector<layout>& _out_layouts,
const std::vector<cldnn::fused_primitive_desc>& _fused_descs,
const std::vector<activation_func>& _fused_act_funcs,
const std::vector<activation_additional_params>& _act_params)
const std::vector<cldnn::fused_primitive_desc>& _fused_descs)
: has_runtime_layouts(true)
, prog(&_prog)
, desc(_desc)
@ -151,8 +147,6 @@ struct kernel_impl_params {
, input_layouts(_in_layouts)
, output_layouts(_out_layouts)
, fused_desc(_fused_descs)
, fused_act_funcs(_fused_act_funcs)
, activation_params(_act_params)
, primary_input_idx(0) {
}
@ -208,31 +202,8 @@ kernel_selector::dim_tensor<T> convert_dim_vector(const tensor& t) {
static_cast<T>(sizes[5])};
}
template <typename p_type>
inline void convert_activation_func_params(const p_type primitive, std::vector<kernel_selector::base_activation_params>& params) {
const float negative_slope = primitive->activation_negative_slope;
if (negative_slope != 0.0f) {
params.emplace_back(kernel_selector::activation_function::RELU_NEGATIVE_SLOPE, negative_slope, 0.0f);
} else {
params.emplace_back(kernel_selector::activation_function::RELU, 0.0f, 0.0f);
}
}
inline void convert_fused_activation_func_params(const kernel_impl_params& param_info, std::vector<kernel_selector::base_activation_params>& params) {
const auto& act_funcs = param_info.fused_act_funcs;
const auto& act_params = param_info.activation_params;
for (size_t i = 0; i < act_funcs.size(); i++) {
params.emplace_back(get_kernel_selector_activation_param(act_funcs[i]),
act_params[i].a,
act_params[i].b);
}
}
template <typename p_type>
inline void convert_new_activation_func(const p_type primitive, std::vector<kernel_selector::base_activation_params>& params) {
params.insert(params.begin(), {get_kernel_selector_activation_param(primitive->activation_function),
primitive->additional_params.a,
primitive->additional_params.b});
}
void convert_fused_ops_to_legacy_activations(const kernel_impl_params& param_info, std::vector<kernel_selector::base_activation_params>& activations);
bool use_legacy_fused_ops(const kernel_impl_params& param_info);
void set_params(const kernel_impl_params& param_info, kernel_selector::params& params);
@ -249,56 +220,62 @@ inline params_t get_default_params(const kernel_impl_params& param_info) {
params.outputs[0] = convert_data_tensor(output_layout);
params.layerID = param_info.desc->id;
convert_fused_activation_func_params(param_info, params.activations);
std::map<primitive_id, std::pair<size_t, kernel_selector::Datatype>> prim_id_type_map;
size_t op_id = 0;
for (auto& fused_prim : param_info.fused_desc) {
kernel_selector::fused_operation_desc desc;
desc.op_params = std::move(fused_prim.f_param);
if (use_legacy_fused_ops(param_info)) {
// Single activation is converted to legacy fused ops format to keep good performance
// TODO: Remove it once all kernels supports new fused ops mechanism
convert_fused_ops_to_legacy_activations(param_info, params.activations);
} else {
std::map<primitive_id, std::pair<size_t, kernel_selector::Datatype>> prim_id_type_map;
size_t op_id = 0;
for (auto& fused_prim : param_info.fused_desc) {
kernel_selector::fused_operation_desc desc;
desc.op_params = std::move(fused_prim.f_param);
if (!desc.op_params) {
CLDNN_ERROR_MESSAGE(param_info.desc->id, "Invalid fused operation (" + param_info.desc->id + ") of type " +
param_info.desc->type_string());
}
if (!desc.op_params) {
CLDNN_ERROR_MESSAGE(param_info.desc->id, "Invalid fused operation (" + param_info.desc->id + ") of type " +
param_info.desc->type_string());
}
desc.dep_idx_start = fused_prim.dep_start_idx;
desc.dep_size = fused_prim.deps.size();
desc.op_id = op_id++;
desc.output_tensor = convert_data_tensor(fused_prim.output_layout);
prim_id_type_map[fused_prim.desc->id] = std::make_pair(desc.op_id, desc.output_tensor.GetDType());
desc.dep_idx_start = fused_prim.dep_start_idx;
desc.dep_size = fused_prim.deps.size();
desc.op_id = op_id++;
desc.output_tensor = convert_data_tensor(fused_prim.output_layout);
prim_id_type_map[fused_prim.desc->id] = std::make_pair(desc.op_id, desc.output_tensor.GetDType());
for (size_t i = desc.dep_idx_start; i < desc.dep_idx_start + desc.dep_size; i++) {
desc.tensors.push_back(convert_data_tensor(param_info.get_input_layout(i)));
}
for (size_t i = desc.dep_idx_start; i < desc.dep_idx_start + desc.dep_size; i++) {
desc.tensors.push_back(convert_data_tensor(param_info.get_input_layout(i)));
}
if (fused_prim.total_num_deps > 0) {
desc.dep_data.resize(fused_prim.total_num_deps);
for (auto& dep : fused_prim.fused_deps) {
auto iter = prim_id_type_map.find(dep.first);
if (iter != prim_id_type_map.end()) {
auto& op_data = iter->second;
desc.dep_data[dep.second].dep_type = kernel_selector::DepType::INTERNAL;
desc.dep_data[dep.second].op_id = op_data.first;
desc.dep_data[dep.second].data_type = op_data.second;
}
}
int idx = 0;
for (auto& dep : fused_prim.deps) {
desc.dep_data[dep.second].dep_type = kernel_selector::DepType::EXTERNAL;
desc.dep_data[dep.second].op_id = idx;
desc.dep_data[dep.second].data_type = desc.tensors[idx++].GetDType();
}
for (auto& dep : desc.dep_data) {
if (dep.dep_type == kernel_selector::DepType::UNDEFINED) {
dep.dep_type = kernel_selector::DepType::ORIGINAL;
break;
if (fused_prim.total_num_deps > 0) {
desc.dep_data.resize(fused_prim.total_num_deps);
for (auto& dep : fused_prim.fused_deps) {
auto iter = prim_id_type_map.find(dep.first);
if (iter != prim_id_type_map.end()) {
auto& op_data = iter->second;
desc.dep_data[dep.second].dep_type = kernel_selector::DepType::INTERNAL;
desc.dep_data[dep.second].op_id = op_data.first;
desc.dep_data[dep.second].data_type = op_data.second;
}
}
int idx = 0;
for (auto& dep : fused_prim.deps) {
desc.dep_data[dep.second].dep_type = kernel_selector::DepType::EXTERNAL;
desc.dep_data[dep.second].op_id = idx;
desc.dep_data[dep.second].data_type = desc.tensors[idx++].GetDType();
}
for (auto& dep : desc.dep_data) {
if (dep.dep_type == kernel_selector::DepType::UNDEFINED) {
dep.dep_type = kernel_selector::DepType::ORIGINAL;
break;
}
}
}
params.fused_ops.push_back(desc);
}
params.fused_ops.push_back(desc);
}
return params;
}

View File

@ -198,7 +198,6 @@ private:
void fuse_sigmoid_mul_to_swish(program &p);
void fuse_bias(program &p);
void fuse_reorders(program& p);
void fuse_activations(program& p);
void fuse_simple_primitives(program &p);
void optimize_fused_ops(program &p);
void remove_redundant_reshape(program &p);

View File

@ -91,8 +91,7 @@ public:
virtual std::unique_ptr<kernel_impl_params> get_kernel_impl_params(const std::vector<layout>& in_layouts, const std::vector<layout>& out_layouts) const {
auto params = std::unique_ptr<kernel_impl_params>(new kernel_impl_params(get_program(), get_primitive(), get_unique_id(), in_layouts, out_layouts,
get_fused_primitives(),
get_fused_activations_funcs(), get_fused_activations_params()));
get_fused_primitives()));
params->memory_deps = get_const_memory_deps();
auto deps = get_dependencies();
@ -245,33 +244,6 @@ public:
void unmark() { user_mark = 0; }
bool is_marked() const { return user_mark != 0; }
void add_fused_activation(activation_func activation_func,
activation_additional_params additional_params) {
fused_activations.emplace_back(activation_func, additional_params);
}
std::vector<activation_func> get_fused_activations_funcs() const {
std::vector<activation_func> funcs;
std::transform(fused_activations.begin(),
fused_activations.end(),
std::back_inserter(funcs),
[](fused_activation_params const& p) { return p.func; });
return funcs;
}
std::vector<activation_additional_params> get_fused_activations_params() const {
std::vector<activation_additional_params> params;
std::transform(fused_activations.begin(),
fused_activations.end(),
std::back_inserter(params),
[](fused_activation_params const& p) { return p.params; });
return params;
}
void copy_fused_activation(const program_node& rhs) {
fused_activations = rhs.fused_activations;
}
// check/set if the node can be optimized out (removed from the network)
bool can_be_optimized() const { return optimized; }
void can_be_optimized(bool opt) { optimized = opt; }
@ -435,18 +407,6 @@ protected:
const primitive_id org_id;
struct fused_activation_params {
activation_func func = activation_func::none;
activation_additional_params params = {0.0f, 0.0f};
fused_activation_params() {}
fused_activation_params(activation_func _func, activation_additional_params _params) :
func(_func),
params(_params) {}
};
std::vector<fused_activation_params> fused_activations;
std::vector<fused_primitive_desc> fused_prims;
void invalidate_users() const;

View File

@ -28,7 +28,7 @@ public:
}
bool is_in_place() const {
if (this->is_output() || !this->get_fused_activations_funcs().empty())
if (this->is_output() || this->has_fused_primitives())
return false;
return (!this->get_output_layout().data_padding && !input().get_output_layout(false).data_padding);
}

View File

@ -12,6 +12,24 @@
#include "intel_gpu/graph/serialization/string_serializer.hpp"
#include "intel_gpu/graph/serialization/vector_serializer.hpp"
#include "intel_gpu/primitives/concatenation.hpp"
#include "intel_gpu/primitives/convolution.hpp"
#include "intel_gpu/primitives/crop.hpp"
#include "intel_gpu/primitives/eltwise.hpp"
#include "intel_gpu/primitives/fully_connected.hpp"
#include "intel_gpu/primitives/normalize.hpp"
#include "intel_gpu/primitives/reorder.hpp"
#include "intel_gpu/primitives/reshape.hpp"
#include "intel_gpu/primitives/roi_pooling.hpp"
#include "intel_gpu/primitives/softmax.hpp"
#include "intel_gpu/primitives/depth_to_space.hpp"
#include "intel_gpu/primitives/shuffle_channels.hpp"
#include "intel_gpu/primitives/strided_slice.hpp"
#include "intel_gpu/primitives/cum_sum.hpp"
#include "intel_gpu/primitives/reverse_sequence.hpp"
#include "intel_gpu/primitives/embedding_bag.hpp"
#include "intel_gpu/primitives/extract_image_patches.hpp"
#include <string>
#include <vector>
@ -1008,6 +1026,54 @@ kernel_selector::activation_function get_kernel_selector_activation_param(activa
}
}
void convert_fused_ops_to_legacy_activations(const kernel_impl_params& param_info, std::vector<kernel_selector::base_activation_params>& activations) {
auto op_desc = param_info.fused_desc[0].typed_desc<activation>();
auto func = op_desc->activation_function;
auto params = op_desc->additional_params;
activations.push_back({get_kernel_selector_activation_param(func), params.a, params.b});
}
bool use_legacy_fused_ops(const kernel_impl_params& param_info) {
const auto& fused_ops = param_info.fused_desc;
if (fused_ops.size() != 1)
return false;
const auto& fused_op = fused_ops[0];
if (!fused_op.is_type<activation>())
return false;
if (!fused_op.deps.empty())
return false;
std::vector<primitive_type_id> legacy_fusion_list = {
concatenation::type_id(),
convolution::type_id(),
crop::type_id(),
eltwise::type_id(),
fully_connected::type_id(),
normalize::type_id(),
reorder::type_id(),
reshape::type_id(),
roi_pooling::type_id(),
softmax::type_id(),
depth_to_space::type_id(),
shuffle_channels::type_id(),
strided_slice::type_id(),
cum_sum::type_id(),
reverse_sequence::type_id(),
embedding_bag::type_id(),
extract_image_patches::type_id()
};
if (std::find(legacy_fusion_list.begin(), legacy_fusion_list.end(), param_info.desc->type) == legacy_fusion_list.end()) {
return false;
}
return true;
}
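
Taken together, use_legacy_fused_ops() and convert_fused_ops_to_legacy_activations() keep the old single-activation kernel path alive: when the only fused op is a dependency-free activation and the target primitive type is on the whitelist, the kernel selector still receives one legacy base_activation_params entry instead of a generic fused op. A self-contained sketch of that contract, with modelled types rather than the real kernel_selector structs:

#include <vector>

enum class ks_activation_function { RELU, LOGISTIC, LINEAR };

// Modelled after kernel_selector::base_activation_params: function plus (a, b).
struct base_activation_params_model {
    ks_activation_function function;
    float a, b;
};

// Modelled fused activation descriptor: parameters plus whether it carries
// extra inputs (deps), which rules out the legacy conversion.
struct fused_activation_model {
    ks_activation_function function;
    float a, b;
    bool has_extra_inputs;
};

bool try_convert_to_legacy(const std::vector<fused_activation_model>& fused_ops,
                           std::vector<base_activation_params_model>& activations) {
    // The real check also requires the fused op to be an activation and the
    // target primitive type to be on the whitelist above.
    if (fused_ops.size() != 1 || fused_ops[0].has_extra_inputs)
        return false;  // several fused ops, or a parameterized activation
    activations.push_back({fused_ops[0].function, fused_ops[0].a, fused_ops[0].b});
    return true;
}

int main() {
    std::vector<base_activation_params_model> acts;
    const bool legacy = try_convert_to_legacy({{ks_activation_function::RELU, 0.f, 0.f, false}}, acts);
    return (legacy && acts.size() == 1) ? 0 : 1;
}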
void set_params(const kernel_impl_params& param_info, kernel_selector::params& params) {
const auto& program = param_info.prog;
const auto& device_info = program->get_engine().get_device_info();

View File

@ -1159,14 +1159,6 @@ void program::fuse_nodes(program_node &fused_node,
local_desc.total_num_deps = peer_node.get_dependencies().size();
local_desc.input_layout = peer_node.get_dependency(0).get_output_layout();
local_desc.output_layout = peer_layout;
local_desc.activation = activation_func::none;
if (!peer_node.get_fused_activations_funcs().empty()) {
if (peer_node.get_fused_activations_funcs().size() > 1)
CLDNN_ERROR_MESSAGE(peer_node.id(), "Fused primitive descriptor doesn't support > 1 activation functions in a peer node");
local_desc.activation = peer_node.get_fused_activations_funcs()[0];
local_desc.activation_params = peer_node.get_fused_activations_params()[0];
}
auto fusedPadding = fused_node.get_output_layout().data_padding;
cldnn::padding needed_padding = padding::max(peer_layout.data_padding,

View File

@ -131,20 +131,6 @@ std::unique_ptr<json_composite> program_node::desc_to_json() const {
}
node_info->add("fused primitives", fused_nodes_info);
json_composite fused_activations;
auto fused_activations_funcs = get_fused_activations_funcs();
if (!fused_activations_funcs.empty()) {
for (size_t i = 0; i < fused_activations_funcs.size(); i++) {
json_composite fused_activation_info;
auto activation_type = activation_type_to_str(fused_activations_funcs[i]);
auto params = get_fused_activations_params()[i];
fused_activation_info.add("params", "a=" + std::to_string(params.a) + ", b=" + std::to_string(params.b));
fused_activation_info.add("activation", activation_type);
fused_activations.add("fused activation idx " + std::to_string(i), fused_activation_info);
}
node_info->add("fused activations (legacy)", fused_activations);
}
#ifdef ENABLE_ONEDNN_FOR_GPU
auto& onednn_post_ops = get_fused_primitives_onednn();
if (onednn_post_ops.size()) {
@ -1174,27 +1160,6 @@ void program_node::init_onednn_primitive_attributes() {
}
}
if (cldnn_post_ops.size() && get_fused_activations_funcs().size())
throw std::runtime_error("Unsupported mix of fused ops and activations");
for (size_t i = 0; i < get_fused_activations_funcs().size(); i++) {
auto activation_type = get_fused_activations_funcs()[i];
if (activation_type == cldnn::activation_func::hsigmoid) {
// Unsupported hsigmoid oneDNN gpu, splits hsigmoid activation min(max(val + 3, 0), 6) / 6
post_ops.append_eltwise(dnnl::algorithm::eltwise_linear, 1.f, 3.f);
post_ops.append_eltwise(dnnl::algorithm::eltwise_clip, 0.f, 6.f);
post_ops.append_eltwise(dnnl::algorithm::eltwise_linear, 1/6.f, 0.f);
update_onednn_post_op_list(onednn_post_op_type::eltwise_linear, empty_mem);
update_onednn_post_op_list(onednn_post_op_type::eltwise_clip, empty_mem);
update_onednn_post_op_list(onednn_post_op_type::eltwise_linear, empty_mem);
} else {
auto params = get_fused_activations_params()[i];
dnnl::algorithm alg = onednn::convert_activation_func(activation_type);
post_ops.append_eltwise(alg, params.a, params.b);
update_onednn_post_op_list(onednn_post_op_type::eltwise_act, empty_mem);
}
}
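
The removed branch above decomposes hard-sigmoid, min(max(x + 3, 0), 6) / 6, into three oneDNN eltwise post-ops: linear(1, 3), then clip(0, 6), then linear(1/6, 0). A self-contained check of that decomposition:

#include <algorithm>
#include <cassert>
#include <cmath>

static float linear(float x, float a, float b) { return a * x + b; }
static float clip(float x, float lo, float hi) { return std::min(std::max(x, lo), hi); }

int main() {
    for (float x : {-5.0f, -1.0f, 0.0f, 2.0f, 7.0f}) {
        const float reference = std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f;
        const float chained = linear(clip(linear(x, 1.0f, 3.0f), 0.0f, 6.0f), 1.0f / 6.0f, 0.0f);
        assert(std::fabs(reference - chained) < 1e-6f);
    }
    return 0;
}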
// Trying to optimize more than 1 post-ops
if (fused_ops.size() > 1) {
dnnl::post_ops optimized_post_ops = post_ops;

View File

@ -69,7 +69,7 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
__global INPUT0_TYPE* I,
__global OUTPUT_TYPE* O,
#if FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_FBXYB || FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_XFBYB
__read_only image2d_t U,
__read_only image2d_t U
#else
__global FILTER_TYPE* U
#endif