[GPU] Remove legacy activation fused ops from program (#15075)
* [GPU] Fix winograd kernel
* [GPU] Remove fusion via legacy mechanism
* [GPU] Conversion to legacy activations in kernel selector helper
This commit is contained in:
parent
65268d32df
commit
5b389860a1
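Background for reviewers: the "legacy mechanism" removed here kept a per-node list of fused activations on program_node (add_fused_activation / get_fused_activations_funcs), while the remaining path records every fused op, activations included, as a fused_primitive_desc and passes only check has_fused_primitives(). The snippet below is a minimal, self-contained sketch of the two styles using simplified stand-in types; it is not the real cldnn code, only the member and function names mirror the ones touched by this diff.

```cpp
// Minimal sketch (assumed, simplified types; not the actual cldnn classes) of the two
// fusion bookkeeping styles this commit consolidates.
#include <utility>
#include <vector>

enum class activation_func { none, relu, hyperbolic_tan };
struct activation_additional_params { float a = 0.f; float b = 0.f; };

// Stand-in for cldnn::fused_primitive_desc: one entry per fused op of any kind.
struct fused_primitive_desc_sketch {
    activation_func func = activation_func::none;
    activation_additional_params params{};
};

struct node_sketch {
    // Legacy mechanism (removed by this commit): activations were appended to a
    // dedicated per-node list and later translated into legacy kernel params.
    std::vector<std::pair<activation_func, activation_additional_params>> fused_activations;
    void add_fused_activation(activation_func f, activation_additional_params p) {
        fused_activations.emplace_back(f, p);
    }

    // Remaining mechanism: every fused op, activations included, becomes a
    // fused primitive descriptor, and graph passes only ask has_fused_primitives().
    std::vector<fused_primitive_desc_sketch> fused_prims;
    bool has_fused_primitives() const { return !fused_prims.empty(); }
};

int main() {
    node_sketch node;
    // Old style: optimization passes called add_fused_activation() directly on the node.
    node.add_fused_activation(activation_func::relu, {0.f, 0.f});
    // New style: the activation is recorded as a fused primitive descriptor instead.
    node.fused_prims.push_back({activation_func::relu, {0.f, 0.f}});
    return node.has_fused_primitives() ? 0 : 1;
}
```

In the kernel selector helper, the new use_legacy_fused_ops() / convert_fused_ops_to_legacy_activations() pair added further down in this diff keeps the old single-activation fast path for kernels that have not adopted the new fused-ops code generation yet.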
@@ -31,7 +31,6 @@ void handle_reshape::run(program& p) {
            auto output_lay = node.get_output_layout();

            if (!node.is_in_place() ||
-               !node.get_fused_activations_funcs().empty() ||
                node.has_fused_primitives())
                return;

@@ -55,7 +54,7 @@ void handle_reshape::run(program& p) {
        while (node_itr != p.get_processing_order().end()) {
            auto& node = (*node_itr++);
            program_helpers::do_for_types<reshape>(*node, [&p](reshape_node& node) {
-               if (node.is_output() || node.get_users().size() > 1 || !node.get_fused_activations_funcs().empty())
+               if (node.is_output() || node.get_users().size() > 1 || node.has_fused_primitives())
                    return;

                auto& out_node = node.get_users().front();
@@ -306,14 +306,20 @@ void pre_replace_deconv::run(program& p) {
                auto pixel_shuffle_prim = std::make_shared<depth_to_space>(deconv_node_id, deconv_id_conv, 2, depth_to_space_mode::blocks_first);

                program_node& pixel_shuffle_node = p.get_or_create(pixel_shuffle_prim);
-               pixel_shuffle_node.add_fused_activation(activation_func::linear, { 1, bias });
+               auto bias_id = deconv_node_id + "_bias";
+               auto bias_prim = std::make_shared<activation>(bias_id,
+                                                             input_info(deconv_node_id),
+                                                             activation_func::linear,
+                                                             activation_additional_params{ 1, bias });
+               program_node& bias_node = p.get_or_create(bias_prim);

-               // add connections input->convolution, weights->convolution
+               // add connections input->depth_to_space, depth_to_space->bias
                p.add_connection(conv_node, pixel_shuffle_node);
+               p.add_connection(pixel_shuffle_node, bias_node);

                auto deconv_node_ptr = p.nodes_map.find(rename_id);
                if (deconv_node_ptr != p.nodes_map.end()) {
-                   p.replace_all_usages(*deconv_node_ptr->second, pixel_shuffle_node);
+                   p.replace_all_usages(*deconv_node_ptr->second, bias_node);
                    p.optimized_out.push_back(rename_id);
                    p.nodes_map.erase(rename_id);
                }
@@ -67,9 +67,7 @@ bool concat_noop_optimization::match(concatenation_node& node) {
        return false;
    if (node.is_dynamic())
        return false;
-   return node.get_dependencies().size() == 1 &&
-          !node.has_fused_primitives() &&
-          node.get_fused_activations_funcs().empty();
+   return node.get_dependencies().size() == 1 && !node.has_fused_primitives();
}

bool concat_noop_optimization::optimize(concatenation_node& node) {
@@ -84,7 +82,7 @@ bool concat_noop_optimization::optimize(concatenation_node& node) {
bool concat_in_place_optimization::match(concatenation_node& node) {
    if (node.is_output())
        return false;
-   if (node.has_fused_primitives() || !node.get_fused_activations_funcs().empty())
+   if (node.has_fused_primitives())
        return false;
    if (node.is_dynamic())
        return false;
@@ -300,7 +298,7 @@ void concat_in_place_optimization::optimize_cascade(concatenation_node& node, st
} // namespace

static bool can_reshape_be_optimized(const reshape_node& node) {
-   return node.is_in_place() && node.get_fused_activations_funcs().empty();
+   return node.is_in_place() && !node.has_fused_primitives();
}

// ToDo remove friendship relation from program_node
@@ -322,11 +320,11 @@ void prepare_buffer_fusing::run(program& p) {
        // The condition below check only output layout as cases like
        // (dyn_shape) -> reshape -> (static_shape) -> some_static_primitive
        // may have invalid set_arguments call as output memory of reshape won't be available until reshape primitive is executed
-       if (node->is_type<reshape>() && is_dynamic && is_planar && no_pad && !node->is_output() && node->get_fused_activations_funcs().empty()) {
+       if (node->is_type<reshape>() && is_dynamic && is_planar && no_pad && !node->is_output() && !node->has_fused_primitives()) {
            return true;
        }

-       if (node->is_dynamic() || node->is_output() || (!node->get_fused_activations_funcs().empty())) {
+       if (node->is_dynamic() || node->is_output() || node->has_fused_primitives()) {
            return false;
        }
        return true;
@@ -58,7 +58,6 @@ void prepare_primitive_fusing::run(program& p) {
    fuse_sigmoid_mul_to_swish(p);
    fuse_bias(p);
    fuse_simple_primitives(p);
-   fuse_activations(p);
    optimize_fused_ops(p);
}

@@ -226,105 +225,6 @@ void prepare_primitive_fusing::fuse_reorders(program &p) {
        }
    }

-void prepare_primitive_fusing::fuse_activations(program &p) {
-    std::map<primitive_id, std::vector<std::pair<primitive_id, size_t>>> fusing_history;
-    bool use_onednn_impls = false;
-
-#ifdef ENABLE_ONEDNN_FOR_GPU
-    auto& engine = p.get_engine();
-    if (engine.get_device_info().supports_immad && p.get_config().get_property(ov::intel_gpu::queue_type) == QueueTypes::in_order)
-        use_onednn_impls = true;
-#endif
-
-    auto itr = p.get_processing_order().begin();
-    while (itr != p.get_processing_order().end()) {
-        auto node_itr = itr++;
-        auto& node = (*node_itr);
-
-        program_helpers::do_for_types<activation>(*node, [&p, &fusing_history, &use_onednn_impls](activation_node& node) {
-            auto& input = node.input();
-            auto id = node.id();
-            // Restrictions:
-            // - inputs cannot be padded
-            // - primitives input cannot be output
-            // - no activation additional input
-            // - input was optimized
-            // - can't have fused primitives
-            if (node.has_padded_dependency() || input.is_output() || node.is_output() ||
-                node.get_dependencies().size() != 1 || input.can_be_optimized() || node.is_constant() ||
-                node.has_fused_primitives())
-                return;
-
-            if (use_onednn_impls && node.get_primitive()->activation_function == cldnn::activation_func::hyperbolic_tan) {
-                return;
-            }
-
-            // - limit to primitives which implementations support activation fusing
-            if (input.get_users().size() != 1 ||
-                // TODO: new api needs to be created to read such caps
-                // right now use whitelist so no new primitives will be affected in case of lack of fused activation
-                // support
-                (!input.is_type<concatenation>() && !input.is_type<convolution>() &&
-                 !input.is_type<crop>() && !input.is_type<deconvolution>() && !input.is_type<eltwise>() &&
-                 !input.is_type<fully_connected>() && !input.is_type<lrn>() && !input.is_type<normalize>() &&
-                 !input.is_type<permute>() && !input.is_type<pooling>() && !input.is_type<reorder>() &&
-                 !input.is_type<reshape>() && !input.is_type<roi_pooling>() &&
-                 !input.is_type<softmax>() && !input.is_type<resample>() && !input.is_type<mvn>() &&
-                 !input.is_type<depth_to_space>() && !input.is_type<batch_to_space>() &&
-                 !input.is_type<space_to_batch>() && !input.is_type<gather>() && !input.is_type<scatter_update>() && !input.is_type<shuffle_channels>() &&
-                 !input.is_type<scatter_nd_update>() &&
-                 !input.is_type<gather_nd>() &&
-                 !input.is_type<gather_elements>() &&
-                 !input.is_type<strided_slice>() && !input.is_type<cum_sum>() && !input.is_type<reverse_sequence>() &&
-                 !input.is_type<embedding_bag>() && !input.is_type<extract_image_patches>() &&
-                 !input.is_type<activation>()))
-                return;
-
-            if (input.is_type<eltwise>()) {
-                bool is_quantization = true;
-                for (auto& in : input.get_dependencies()) {
-                    if (!data_type_traits::is_i8_u8(in.first->get_output_layout().data_type))
-                        is_quantization = false;
-                }
-
-                // TODO: Add new fused ops mechanism support to eltwise kernel in order to enable fusings in case of quantization
-                if (is_quantization)
-                    return;
-            }
-
-            if (use_onednn_impls) {
-                if (input.is_type<reshape>() || input.is_type<concatenation>())
-                    return;
-#ifdef ENABLE_ONEDNN_FOR_GPU
-                // Activation should not be fused if it isn't supported in onednn
-                try {
-                    onednn::convert_activation_func(node.get_primitive()->activation_function);
-                } catch (...) {
-                    return;
-                }
-#endif
-            }
-
-            if (input.get_fused_primitives().empty()) {
-                input.add_fused_activation(node.get_primitive()->activation_function, node.get_primitive()->additional_params);
-                for (size_t i = 0; i < node.get_fused_activations_funcs().size(); i++) {
-                    input.add_fused_activation(node.get_fused_activations_funcs()[i],
-                                               node.get_fused_activations_params()[i]);
-                }
-                auto outputPadding = node.get_output_layout().data_padding;
-                input.set_output_padding(outputPadding);
-                p.extract_and_remove(node);
-            } else {
-                // If node already has any fused node using new mechanism,
-                // we can just use the same way and handle any amount of activations
-                p.fuse_nodes(input, node, &fusing_history);
-            }
-
-            p.add_optimized_primitive_info(id, {input.id()});
-        });
-    }
-}
-
void prepare_primitive_fusing::fuse_bias(program &p) {
    auto itr = p.get_processing_order().begin();
    while (itr != p.get_processing_order().end()) {
@@ -781,11 +681,11 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
            return;
        }

-       auto& input_data = activation_node.get_dependency(0);
+       auto& input = activation_node.get_dependency(0);
        if (activation_node.get_dependencies().size() >= 3)
            return;

-       if (!input_data_supports_fusings(input_data, activation_node.id()) || input_data.get_dependencies().empty())
+       if (!input_data_supports_fusings(input, activation_node.id()) || input.get_dependencies().empty())
            return;

        if (_lo.get_optimization_attributes().use_onednn_impls) {
@@ -799,58 +699,86 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
#endif
        }

-       bool should_fuse = input_data.is_type<binary_convolution>();
+       bool should_fuse = input.is_type<binary_convolution>();

-       should_fuse |= input_data.is_type<convolution>() && conv_supports_fusings(input_data.as<convolution>());
+       should_fuse |= input.is_type<convolution>() && conv_supports_fusings(input.as<convolution>());

-       should_fuse |= input_data.is_type<fully_connected>() && fc_supports_fusings(input_data.as<fully_connected>());
+       should_fuse |= input.is_type<fully_connected>() && fc_supports_fusings(input.as<fully_connected>());

-       should_fuse |= input_data.is_type<gemm>() && gemm_supports_fusings(input_data.as<gemm>());
+       should_fuse |= input.is_type<gemm>() && gemm_supports_fusings(input.as<gemm>());

-       should_fuse |= input_data.is_type<pooling>();
+       should_fuse |= input.is_type<pooling>();

-       should_fuse |= input_data.is_type<resample>();
+       should_fuse |= input.is_type<resample>();

-       should_fuse |= input_data.is_type<mvn>();
+       should_fuse |= input.is_type<mvn>();

-       should_fuse |= input_data.is_type<normalize>() && data_type_traits::is_i8_u8(input_data.get_dependency(0).get_output_layout().data_type);
+       should_fuse |= input.is_type<normalize>() && data_type_traits::is_i8_u8(input.get_dependency(0).get_output_layout().data_type);

-       should_fuse |= input_data.is_type<deconvolution>();
+       should_fuse |= input.is_type<deconvolution>();

-       should_fuse |= input_data.is_type<permute>();
+       should_fuse |= input.is_type<permute>();

-       should_fuse |= input_data.is_type<activation>();
+       should_fuse |= input.is_type<activation>();

-       should_fuse |= input_data.is_type<lrn>();
+       should_fuse |= input.is_type<lrn>();

-       should_fuse |= input_data.is_type<gather>();
+       should_fuse |= input.is_type<gather>();

-       should_fuse |= input_data.is_type<gather_nd>();
+       should_fuse |= input.is_type<gather_nd>();

-       should_fuse |= input_data.is_type<gather_elements>();
+       should_fuse |= input.is_type<gather_elements>();

-       should_fuse |= input_data.is_type<scatter_update>();
+       should_fuse |= input.is_type<scatter_update>();

-       should_fuse |= input_data.is_type<scatter_nd_update>();
+       should_fuse |= input.is_type<scatter_nd_update>();

-       should_fuse |= input_data.is_type<scatter_elements_update>();
+       should_fuse |= input.is_type<scatter_elements_update>();

-       should_fuse |= input_data.is_type<depth_to_space>();
+       should_fuse |= input.is_type<depth_to_space>();

-       should_fuse |= input_data.is_type<space_to_depth>();
+       should_fuse |= input.is_type<space_to_depth>();

-       should_fuse |= input_data.is_type<batch_to_space>();
+       should_fuse |= input.is_type<batch_to_space>();

-       should_fuse |= input_data.is_type<space_to_batch>();
+       should_fuse |= input.is_type<space_to_batch>();

-       should_fuse |= input_data.is_type<reduce>() && reduce_supports_fusings(input_data.as<reduce>());
+       should_fuse |= input.is_type<reduce>() && reduce_supports_fusings(input.as<reduce>());

-       should_fuse |= input_data.is_type<eltwise>() && eltwise_supports_fusings(input_data.as<eltwise>());
+       should_fuse |= input.is_type<eltwise>() && eltwise_supports_fusings(input.as<eltwise>());

+       bool legacy_fusion = activation_node.get_dependencies().size() == 1 &&
+                            !input.can_be_optimized() &&
+                            !activation_node.is_constant() &&
+                            !activation_node.has_fused_primitives() &&
+                            (input.is_type<concatenation>() ||
+                             input.is_type<convolution>() ||
+                             input.is_type<crop>() ||
+                             input.is_type<eltwise>() ||
+                             input.is_type<fully_connected>() ||
+                             input.is_type<normalize>() ||
+                             input.is_type<reorder>() ||
+                             input.is_type<reshape>() ||
+                             input.is_type<roi_pooling>() ||
+                             input.is_type<softmax>() ||
+                             input.is_type<depth_to_space>() ||
+                             input.is_type<shuffle_channels>() ||
+                             input.is_type<strided_slice>() ||
+                             input.is_type<cum_sum>() ||
+                             input.is_type<reverse_sequence>() ||
+                             input.is_type<embedding_bag>() ||
+                             input.is_type<extract_image_patches>());
+
+       if (!should_fuse && legacy_fusion) {
+           GPU_DEBUG_LOG << activation_node.id() << " is fused by legacy conditions! Consider adding selected kernel with fused ops support\n";
+       }
+
+       should_fuse |= legacy_fusion;

        if (!should_fuse)
            return;

-       p.fuse_nodes(input_data, activation_node, &fusing_history);
+       p.fuse_nodes(input, activation_node, &fusing_history);
    };

    auto fuse_quantize_f = [&](quantize_node& quantize_node) {
@@ -61,7 +61,7 @@ void remove_redundant_reorders::run(program& p) {
        if (node.has_mean() || !node.get_primitive()->subtract_per_feature.empty())
            continue;

-       if (!node.get_fused_activations_funcs().empty())
+       if (node.has_fused_primitives())
            continue;

        std::function<bool(program_node&)> has_quantize_user;
@@ -149,7 +149,7 @@ void remove_redundant_reorders::run(program& p) {
                          !r_dep_node.has_mean() &&
                          r_dep_node.get_primitive()->subtract_per_feature.empty() &&
                          !r_dep_node.is_output() &&
-                         r_dep_node.get_fused_activations_funcs().empty() &&
+                         !r_dep_node.has_fused_primitives() &&
                          !r_dep_node.get_primitive()->has_surface_input();

        // for chains like
@@ -165,7 +165,7 @@ void remove_redundant_reorders::run(program& p) {
                          !r_dep_node.is_output() &&
                          !r_node.has_mean() &&
                          r_node.get_primitive()->subtract_per_feature.empty() &&
-                         r_node.get_fused_activations_funcs().empty() &&
+                         !r_node.has_fused_primitives() &&
                          !r_node.get_primitive()->has_surface_input();

        if (remove_dep) {
@@ -205,7 +205,7 @@ void remove_redundant_reorders::run(program& p) {
            r_node.has_mean() ||
            r_node.get_users().size() > 1 ||
            r_node.get_primitive()->subtract_per_feature.size() ||
-           r_node.get_fused_activations_funcs().size())
+           r_node.has_fused_primitives())
            continue;

        if (!r_node.get_users().front()->is_type<concatenation>())
@@ -258,7 +258,7 @@ void remove_redundant_reorders::run(program& p) {
        if (r_node.has_mean() ||
            !r_node.get_primitive()->subtract_per_feature.empty() ||
            no_output_optimization ||
-           !r_node.get_fused_activations_funcs().empty() ||
+           r_node.has_fused_primitives() ||
            r_node.get_primitive()->has_surface_input())
            continue;

@@ -335,7 +335,7 @@ void remove_redundant_reorders::run(program& p) {
            if (user->is_type<reorder>() &&
                user != node &&
                !user->is_output() &&
-               user->get_fused_activations_funcs().empty()) {
+               !user->has_fused_primitives()) {
                auto l1 = node->get_output_layout();
                auto l2 = user->get_output_layout();

@@ -382,7 +382,7 @@ void remove_redundant_reorders::run(program& p) {
        if (node.has_mean() || !node.get_primitive()->subtract_per_feature.empty())
            continue;

-       if (!node.get_fused_activations_funcs().empty())
+       if (node.has_fused_primitives())
            continue;

        if (input.get_users().size() != 1)
@@ -530,7 +530,6 @@ void remove_redundant_reorders::run(program& p) {
        local_desc.f_param = node->get_fuse_params();
        local_desc.dep_start_idx = input.get_fused_primitives().size();
        local_desc.output_layout = output_layout;
-       local_desc.activation = activation_func::none;
        input.add_fused_primitive(local_desc);

        // remove reorder node
@@ -561,8 +560,7 @@ void remove_redundant_reorders::run(program& p) {
            !r_node.get_primitive()->subtract_per_feature.empty())
            continue;

-       if (!r_node.get_fused_activations_funcs().empty() ||
-           !r_node.get_fused_primitives().empty())
+       if (r_node.has_fused_primitives())
            continue;

        // Remove reorder for Convolution bfyx -> fs_b_yx_fsv32
@@ -596,10 +594,10 @@ void remove_redundant_reorders::run(program& p) {
        auto& reshape_input_node = dep_node.as<reshape>();

        bool remove_dep = reshape_input_node.get_users().size() == 1 && !reshape_input_node.is_output() &&
-                         reshape_input_node.get_fused_activations_funcs().empty() && reshape_input_node.get_fused_primitives().empty();
+                         !reshape_input_node.has_fused_primitives();
        bool remove_current = remove_dep && !reshape_input_node.get_dependencies().empty() &&
                              reshape_input_node.get_dependency(0).get_output_layout() == reshape_node.get_output_layout() &&
-                             reshape_node.get_fused_activations_funcs().empty() && reshape_node.get_fused_primitives().empty();
+                             !reshape_node.has_fused_primitives();

        if (remove_dep) {
            LOG_NODE_REMOVAL(reshape_input_node.id());
@@ -10,6 +10,14 @@
#include "kernel_selector_helper.h"
#include "primitive_base.hpp"

+namespace {
+inline void convert_new_activation_func(const activation& prim, std::vector<kernel_selector::base_activation_params>& params) {
+    params.insert(params.begin(), {get_kernel_selector_activation_param(prim.activation_function),
+                                   prim.additional_params.a,
+                                   prim.additional_params.b});
+}
+}  // namespace
+
namespace cldnn {
namespace ocl {

@@ -40,7 +48,7 @@ struct activation_impl : typed_primitive_impl_ocl<activation> {
        auto params = get_default_params<kernel_selector::activation_params>(impl_param);
        auto optional_params = get_default_optional_params<kernel_selector::activation_optional_params>(impl_param.get_program());

-       convert_new_activation_func(primitive, params.activations);
+       convert_new_activation_func(*primitive, params.activations);

        bool is_parameterized = !primitive->additional_params_input.empty();
        if (is_parameterized) {
@@ -42,9 +42,6 @@ struct fused_primitive_desc {
    std::map<primitive_id, size_t> fused_deps;
    size_t dep_start_idx;
    size_t total_num_deps = 0;
-
-   activation_func activation;
-   activation_additional_params activation_params = { 0.f, 0.f };
};

#ifdef ENABLE_ONEDNN_FOR_GPU
@@ -119,8 +119,6 @@ struct kernel_impl_params {
#ifdef ENABLE_ONEDNN_FOR_GPU
    std::vector<cldnn::fused_primitive_desc_onednn> fused_desc_onednn;
#endif // ENABLE_ONEDNN_FOR_GPU
-   std::vector<activation_func> fused_act_funcs;
-   std::vector<activation_additional_params> activation_params;

    optional_layout weights_layout = optional_layout();

@@ -141,9 +139,7 @@ struct kernel_impl_params {
                       size_t _uid,
                       const std::vector<layout>& _in_layouts,
                       const std::vector<layout>& _out_layouts,
-                      const std::vector<cldnn::fused_primitive_desc>& _fused_descs,
-                      const std::vector<activation_func>& _fused_act_funcs,
-                      const std::vector<activation_additional_params>& _act_params)
+                      const std::vector<cldnn::fused_primitive_desc>& _fused_descs)
    : has_runtime_layouts(true)
    , prog(&_prog)
    , desc(_desc)
@@ -151,8 +147,6 @@ struct kernel_impl_params {
    , input_layouts(_in_layouts)
    , output_layouts(_out_layouts)
    , fused_desc(_fused_descs)
-   , fused_act_funcs(_fused_act_funcs)
-   , activation_params(_act_params)
    , primary_input_idx(0) {
    }

@@ -208,31 +202,8 @@ kernel_selector::dim_tensor<T> convert_dim_vector(const tensor& t) {
                                   static_cast<T>(sizes[5])};
}

-template <typename p_type>
-inline void convert_activation_func_params(const p_type primitive, std::vector<kernel_selector::base_activation_params>& params) {
-    const float negative_slope = primitive->activation_negative_slope;
-    if (negative_slope != 0.0f) {
-        params.emplace_back(kernel_selector::activation_function::RELU_NEGATIVE_SLOPE, negative_slope, 0.0f);
-    } else {
-        params.emplace_back(kernel_selector::activation_function::RELU, 0.0f, 0.0f);
-    }
-}
-
-inline void convert_fused_activation_func_params(const kernel_impl_params& param_info, std::vector<kernel_selector::base_activation_params>& params) {
-    const auto& act_funcs = param_info.fused_act_funcs;
-    const auto& act_params = param_info.activation_params;
-    for (size_t i = 0; i < act_funcs.size(); i++) {
-        params.emplace_back(get_kernel_selector_activation_param(act_funcs[i]),
-                            act_params[i].a,
-                            act_params[i].b);
-    }
-}
-template <typename p_type>
-inline void convert_new_activation_func(const p_type primitive, std::vector<kernel_selector::base_activation_params>& params) {
-    params.insert(params.begin(), {get_kernel_selector_activation_param(primitive->activation_function),
-                                   primitive->additional_params.a,
-                                   primitive->additional_params.b});
-}
+void convert_fused_ops_to_legacy_activations(const kernel_impl_params& param_info, std::vector<kernel_selector::base_activation_params>& activations);
+bool use_legacy_fused_ops(const kernel_impl_params& param_info);

void set_params(const kernel_impl_params& param_info, kernel_selector::params& params);

@@ -249,56 +220,62 @@ inline params_t get_default_params(const kernel_impl_params& param_info) {
    params.outputs[0] = convert_data_tensor(output_layout);
    params.layerID = param_info.desc->id;

-   convert_fused_activation_func_params(param_info, params.activations);
-   std::map<primitive_id, std::pair<size_t, kernel_selector::Datatype>> prim_id_type_map;
-   size_t op_id = 0;
-   for (auto& fused_prim : param_info.fused_desc) {
-       kernel_selector::fused_operation_desc desc;
-       desc.op_params = std::move(fused_prim.f_param);
+   if (use_legacy_fused_ops(param_info)) {
+       // Single activation is converted to legacy fused ops format to keep good performance
+       // TODO: Remove it once all kernels supports new fused ops mechanism
+       convert_fused_ops_to_legacy_activations(param_info, params.activations);
+   } else {
+       std::map<primitive_id, std::pair<size_t, kernel_selector::Datatype>> prim_id_type_map;
+       size_t op_id = 0;
+       for (auto& fused_prim : param_info.fused_desc) {
+           kernel_selector::fused_operation_desc desc;
+           desc.op_params = std::move(fused_prim.f_param);

-       if (!desc.op_params) {
-           CLDNN_ERROR_MESSAGE(param_info.desc->id, "Invalid fused operation (" + param_info.desc->id + ") of type " +
-                               param_info.desc->type_string());
-       }
+           if (!desc.op_params) {
+               CLDNN_ERROR_MESSAGE(param_info.desc->id, "Invalid fused operation (" + param_info.desc->id + ") of type " +
+                                   param_info.desc->type_string());
+           }

-       desc.dep_idx_start = fused_prim.dep_start_idx;
-       desc.dep_size = fused_prim.deps.size();
-       desc.op_id = op_id++;
-       desc.output_tensor = convert_data_tensor(fused_prim.output_layout);
-       prim_id_type_map[fused_prim.desc->id] = std::make_pair(desc.op_id, desc.output_tensor.GetDType());
+           desc.dep_idx_start = fused_prim.dep_start_idx;
+           desc.dep_size = fused_prim.deps.size();
+           desc.op_id = op_id++;
+           desc.output_tensor = convert_data_tensor(fused_prim.output_layout);
+           prim_id_type_map[fused_prim.desc->id] = std::make_pair(desc.op_id, desc.output_tensor.GetDType());

-       for (size_t i = desc.dep_idx_start; i < desc.dep_idx_start + desc.dep_size; i++) {
-           desc.tensors.push_back(convert_data_tensor(param_info.get_input_layout(i)));
-       }
+           for (size_t i = desc.dep_idx_start; i < desc.dep_idx_start + desc.dep_size; i++) {
+               desc.tensors.push_back(convert_data_tensor(param_info.get_input_layout(i)));
+           }

-       if (fused_prim.total_num_deps > 0) {
-           desc.dep_data.resize(fused_prim.total_num_deps);
-           for (auto& dep : fused_prim.fused_deps) {
-               auto iter = prim_id_type_map.find(dep.first);
-               if (iter != prim_id_type_map.end()) {
-                   auto& op_data = iter->second;
-                   desc.dep_data[dep.second].dep_type = kernel_selector::DepType::INTERNAL;
-                   desc.dep_data[dep.second].op_id = op_data.first;
-                   desc.dep_data[dep.second].data_type = op_data.second;
-               }
-           }
-
-           int idx = 0;
-           for (auto& dep : fused_prim.deps) {
-               desc.dep_data[dep.second].dep_type = kernel_selector::DepType::EXTERNAL;
-               desc.dep_data[dep.second].op_id = idx;
-               desc.dep_data[dep.second].data_type = desc.tensors[idx++].GetDType();
-           }
-
-           for (auto& dep : desc.dep_data) {
-               if (dep.dep_type == kernel_selector::DepType::UNDEFINED) {
-                   dep.dep_type = kernel_selector::DepType::ORIGINAL;
-                   break;
-               }
-           }
-       }
-       params.fused_ops.push_back(desc);
-   }
+           if (fused_prim.total_num_deps > 0) {
+               desc.dep_data.resize(fused_prim.total_num_deps);
+               for (auto& dep : fused_prim.fused_deps) {
+                   auto iter = prim_id_type_map.find(dep.first);
+                   if (iter != prim_id_type_map.end()) {
+                       auto& op_data = iter->second;
+                       desc.dep_data[dep.second].dep_type = kernel_selector::DepType::INTERNAL;
+                       desc.dep_data[dep.second].op_id = op_data.first;
+                       desc.dep_data[dep.second].data_type = op_data.second;
+                   }
+               }
+
+               int idx = 0;
+               for (auto& dep : fused_prim.deps) {
+                   desc.dep_data[dep.second].dep_type = kernel_selector::DepType::EXTERNAL;
+                   desc.dep_data[dep.second].op_id = idx;
+                   desc.dep_data[dep.second].data_type = desc.tensors[idx++].GetDType();
+               }
+
+               for (auto& dep : desc.dep_data) {
+                   if (dep.dep_type == kernel_selector::DepType::UNDEFINED) {
+                       dep.dep_type = kernel_selector::DepType::ORIGINAL;
+                       break;
+                   }
+               }
+           }
+           params.fused_ops.push_back(desc);
+       }
+   }

    return params;
}

@@ -198,7 +198,6 @@ private:
    void fuse_sigmoid_mul_to_swish(program &p);
    void fuse_bias(program &p);
    void fuse_reorders(program& p);
-   void fuse_activations(program& p);
    void fuse_simple_primitives(program &p);
    void optimize_fused_ops(program &p);
    void remove_redundant_reshape(program &p);
@@ -91,8 +91,7 @@ public:

    virtual std::unique_ptr<kernel_impl_params> get_kernel_impl_params(const std::vector<layout>& in_layouts, const std::vector<layout>& out_layouts) const {
        auto params = std::unique_ptr<kernel_impl_params>(new kernel_impl_params(get_program(), get_primitive(), get_unique_id(), in_layouts, out_layouts,
-                                                                                 get_fused_primitives(),
-                                                                                 get_fused_activations_funcs(), get_fused_activations_params()));
+                                                                                 get_fused_primitives()));
        params->memory_deps = get_const_memory_deps();

        auto deps = get_dependencies();
@@ -245,33 +244,6 @@ public:
    void unmark() { user_mark = 0; }
    bool is_marked() const { return user_mark != 0; }
-
-   void add_fused_activation(activation_func activation_func,
-                             activation_additional_params additional_params) {
-       fused_activations.emplace_back(activation_func, additional_params);
-   }
-
-   std::vector<activation_func> get_fused_activations_funcs() const {
-       std::vector<activation_func> funcs;
-       std::transform(fused_activations.begin(),
-                      fused_activations.end(),
-                      std::back_inserter(funcs),
-                      [](fused_activation_params const& p) { return p.func; });
-       return funcs;
-   }
-
-   std::vector<activation_additional_params> get_fused_activations_params() const {
-       std::vector<activation_additional_params> params;
-       std::transform(fused_activations.begin(),
-                      fused_activations.end(),
-                      std::back_inserter(params),
-                      [](fused_activation_params const& p) { return p.params; });
-       return params;
-   }
-
-   void copy_fused_activation(const program_node& rhs) {
-       fused_activations = rhs.fused_activations;
-   }

    // check/set if the node can be optimized out (removed from the network)
    bool can_be_optimized() const { return optimized; }
    void can_be_optimized(bool opt) { optimized = opt; }
@@ -435,18 +407,6 @@ protected:

    const primitive_id org_id;

-   struct fused_activation_params {
-       activation_func func = activation_func::none;
-       activation_additional_params params = {0.0f, 0.0f};
-
-       fused_activation_params() {}
-
-       fused_activation_params(activation_func _func, activation_additional_params _params) :
-           func(_func),
-           params(_params) {}
-   };
-
-   std::vector<fused_activation_params> fused_activations;
    std::vector<fused_primitive_desc> fused_prims;

    void invalidate_users() const;
@@ -28,7 +28,7 @@ public:
    }

    bool is_in_place() const {
-       if (this->is_output() || !this->get_fused_activations_funcs().empty())
+       if (this->is_output() || this->has_fused_primitives())
            return false;
        return (!this->get_output_layout().data_padding && !input().get_output_layout(false).data_padding);
    }
@@ -12,6 +12,24 @@
#include "intel_gpu/graph/serialization/string_serializer.hpp"
#include "intel_gpu/graph/serialization/vector_serializer.hpp"

+#include "intel_gpu/primitives/concatenation.hpp"
+#include "intel_gpu/primitives/convolution.hpp"
+#include "intel_gpu/primitives/crop.hpp"
+#include "intel_gpu/primitives/eltwise.hpp"
+#include "intel_gpu/primitives/fully_connected.hpp"
+#include "intel_gpu/primitives/normalize.hpp"
+#include "intel_gpu/primitives/reorder.hpp"
+#include "intel_gpu/primitives/reshape.hpp"
+#include "intel_gpu/primitives/roi_pooling.hpp"
+#include "intel_gpu/primitives/softmax.hpp"
+#include "intel_gpu/primitives/depth_to_space.hpp"
+#include "intel_gpu/primitives/shuffle_channels.hpp"
+#include "intel_gpu/primitives/strided_slice.hpp"
+#include "intel_gpu/primitives/cum_sum.hpp"
+#include "intel_gpu/primitives/reverse_sequence.hpp"
+#include "intel_gpu/primitives/embedding_bag.hpp"
+#include "intel_gpu/primitives/extract_image_patches.hpp"
+
#include <string>
#include <vector>

@@ -1008,6 +1026,54 @@ kernel_selector::activation_function get_kernel_selector_activation_param(activa
    }
}

+void convert_fused_ops_to_legacy_activations(const kernel_impl_params& param_info, std::vector<kernel_selector::base_activation_params>& activations) {
+    auto op_desc = param_info.fused_desc[0].typed_desc<activation>();
+    auto func = op_desc->activation_function;
+    auto params = op_desc->additional_params;
+
+    activations.push_back({get_kernel_selector_activation_param(func), params.a, params.b});
+}
+
+bool use_legacy_fused_ops(const kernel_impl_params& param_info) {
+    const auto& fused_ops = param_info.fused_desc;
+    if (fused_ops.size() != 1)
+        return false;
+
+    const auto& fused_op = fused_ops[0];
+    if (!fused_op.is_type<activation>())
+        return false;
+
+    if (!fused_op.deps.empty())
+        return false;
+
+    std::vector<primitive_type_id> legacy_fusion_list = {
+        concatenation::type_id(),
+        convolution::type_id(),
+        crop::type_id(),
+        eltwise::type_id(),
+        fully_connected::type_id(),
+        normalize::type_id(),
+        reorder::type_id(),
+        reshape::type_id(),
+        roi_pooling::type_id(),
+        softmax::type_id(),
+        depth_to_space::type_id(),
+        shuffle_channels::type_id(),
+        strided_slice::type_id(),
+        cum_sum::type_id(),
+        reverse_sequence::type_id(),
+        embedding_bag::type_id(),
+        extract_image_patches::type_id()
+    };
+
+    if (std::find(legacy_fusion_list.begin(), legacy_fusion_list.end(), param_info.desc->type) == legacy_fusion_list.end()) {
+        return false;
+    }
+
+    return true;
+}
+
void set_params(const kernel_impl_params& param_info, kernel_selector::params& params) {
    const auto& program = param_info.prog;
    const auto& device_info = program->get_engine().get_device_info();
@@ -1159,14 +1159,6 @@ void program::fuse_nodes(program_node &fused_node,
    local_desc.total_num_deps = peer_node.get_dependencies().size();
    local_desc.input_layout = peer_node.get_dependency(0).get_output_layout();
    local_desc.output_layout = peer_layout;
-   local_desc.activation = activation_func::none;
-   if (!peer_node.get_fused_activations_funcs().empty()) {
-       if (peer_node.get_fused_activations_funcs().size() > 1)
-           CLDNN_ERROR_MESSAGE(peer_node.id(), "Fused primitive descriptor doesn't support > 1 activation functions in a peer node");
-
-       local_desc.activation = peer_node.get_fused_activations_funcs()[0];
-       local_desc.activation_params = peer_node.get_fused_activations_params()[0];
-   }

    auto fusedPadding = fused_node.get_output_layout().data_padding;
    cldnn::padding needed_padding = padding::max(peer_layout.data_padding,
@@ -131,20 +131,6 @@ std::unique_ptr<json_composite> program_node::desc_to_json() const {
    }
    node_info->add("fused primitives", fused_nodes_info);

-   json_composite fused_activations;
-   auto fused_activations_funcs = get_fused_activations_funcs();
-   if (!fused_activations_funcs.empty()) {
-       for (size_t i = 0; i < fused_activations_funcs.size(); i++) {
-           json_composite fused_activation_info;
-           auto activation_type = activation_type_to_str(fused_activations_funcs[i]);
-           auto params = get_fused_activations_params()[i];
-           fused_activation_info.add("params", "a=" + std::to_string(params.a) + ", b=" + std::to_string(params.b));
-           fused_activation_info.add("activation", activation_type);
-           fused_activations.add("fused activation idx " + std::to_string(i), fused_activation_info);
-       }
-       node_info->add("fused activations (legacy)", fused_activations);
-   }
-
#ifdef ENABLE_ONEDNN_FOR_GPU
    auto& onednn_post_ops = get_fused_primitives_onednn();
    if (onednn_post_ops.size()) {
@@ -1174,27 +1160,6 @@ void program_node::init_onednn_primitive_attributes() {
        }
    }

-   if (cldnn_post_ops.size() && get_fused_activations_funcs().size())
-       throw std::runtime_error("Unsupported mix of fused ops and activations");
-
-   for (size_t i = 0; i < get_fused_activations_funcs().size(); i++) {
-       auto activation_type = get_fused_activations_funcs()[i];
-       if (activation_type == cldnn::activation_func::hsigmoid) {
-           // Unsupported hsigmoid oneDNN gpu, splits hsigmoid activation min(max(val + 3, 0), 6) / 6
-           post_ops.append_eltwise(dnnl::algorithm::eltwise_linear, 1.f, 3.f);
-           post_ops.append_eltwise(dnnl::algorithm::eltwise_clip, 0.f, 6.f);
-           post_ops.append_eltwise(dnnl::algorithm::eltwise_linear, 1/6.f, 0.f);
-           update_onednn_post_op_list(onednn_post_op_type::eltwise_linear, empty_mem);
-           update_onednn_post_op_list(onednn_post_op_type::eltwise_clip, empty_mem);
-           update_onednn_post_op_list(onednn_post_op_type::eltwise_linear, empty_mem);
-       } else {
-           auto params = get_fused_activations_params()[i];
-           dnnl::algorithm alg = onednn::convert_activation_func(activation_type);
-           post_ops.append_eltwise(alg, params.a, params.b);
-           update_onednn_post_op_list(onednn_post_op_type::eltwise_act, empty_mem);
-       }
-   }
-
    // Trying to optimize more than 1 post-ops
    if (fused_ops.size() > 1) {
        dnnl::post_ops optimized_post_ops = post_ops;
@@ -69,7 +69,7 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
    __global INPUT0_TYPE* I,
    __global OUTPUT_TYPE* O,
#if FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_FBXYB || FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_XFBYB
-   __read_only image2d_t U,
+   __read_only image2d_t U
#else
    __global FILTER_TYPE* U
#endif