/*
 * Patch summary (descriptive header only; everything from the first
 * "diff --git" marker onward is the unmodified patch payload).
 *
 * NOTE(review): in this copy of the patch the hunks are collapsed onto a few
 * physical lines and template argument lists appear to be missing (for
 * example "is_type()" and "as()"), so which primitive types the surrounding
 * checks test cannot be read from here - confirm against the real sources.
 *
 * File 1: clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp
 *   - Inside prepare_primitive_fusing::fuse_simple_primitives a new lambda
 *     eltwise_supports_fusings(eltwise_node&) is added. It reads the node's
 *     output layout and returns false when all of the following hold:
 *       * the output data type is data_types::f16,
 *       * out_layout.size.batch[0] is greater than 1,
 *       * either _lo.get_optimization_attributes().fs_b_yx_fsv32_network is
 *         set or the output format is format::fs_b_yx_fsv32.
 *     In every other case it returns true.
 *   - Four existing fusing conditions gain "&& eltwise_supports_fusings(...)":
 *       * the "should_fuse |=" line in the hunk at old line 425,
 *       * the "should_fuse |=" line in the hunk at old line 484,
 *       * the "should_fuse |=" line in the hunk at old line 574, which keeps
 *         its existing quantize_node.get_scale_shift_opt() requirement,
 *       * one "parents[i]->is_type()" alternative in the hunk at old line 610.
 *
 * File 2: clDNN/tests/test_cases/fusings_gpu_test.cpp
 *   - Adds the macro CASE_ELTWISE_FP16_4: shape {3, 32, 4, 4}, f16 data types,
 *     format::fs_b_yx_fsv32 for both formats, eltwise_mode::sum.
 *   - Adds the fixture eltwise_fp32_fsv32 (derived from EltwiseFusingTest)
 *     with two tests, "add" and "add_per_element". Both build the topology
 *         input, input2 -> eltwise -> add (sum with add_data)
 *                       -> activation (negative) -> reorder "out" (f32),
 *     force the "eltwise" primitive to the implementation
 *     { format::fs_b_yx_fsv32, "eltwise_fs_b_yx_fsv32" } through bo_fused,
 *     set tolerance to 1e-5f and call execute(p).
 *     The only difference between them is the layout of "add_data":
 *     get_per_channel_layout(p) in "add", get_input_layout(p) in
 *     "add_per_element".
 *   - Instantiates the fixture with eltwise_test_params{CASE_ELTWISE_FP16_4, 4, 5}.
 *
 * NOTE(review): the fixture name says "fp32" while its only instantiated case
 * is the FP16 one - confirm the naming is intentional.
 */
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp index 97b26c71b28..f316a73dd3f 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp @@ -378,6 +378,16 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) { return false; }; + auto eltwise_supports_fusings = [&](eltwise_node& node) -> bool { + auto out_layout = node.get_output_layout(); + if (out_layout.data_type == data_types::f16 && out_layout.size.batch[0] > 1 && + (_lo.get_optimization_attributes().fs_b_yx_fsv32_network || out_layout.format == format::fs_b_yx_fsv32)) { + return false; + } + + return true; + }; + auto fuse_activation_f = [&](activation_node& activation_node) { auto& input_data = activation_node.get_dependency(0); if (input_data.get_users().size() != 1 || activation_node.get_dependencies().size() >= 3) @@ -425,7 +435,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) { should_fuse |= input_data.is_type(); - should_fuse |= input_data.is_type(); + should_fuse |= input_data.is_type() && eltwise_supports_fusings(input_data.as()); if (!should_fuse) return; @@ -484,7 +494,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) { should_fuse |= input_data.is_type(); - should_fuse |= input_data.is_type(); + should_fuse |= input_data.is_type() && eltwise_supports_fusings(input_data.as()); if (!should_fuse) return; @@ -574,7 +584,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) { reduce_supports_fusings(input_data.as()) && quantize_node.get_scale_shift_opt(); - should_fuse |= input_data.is_type() && quantize_node.get_scale_shift_opt(); + should_fuse |= input_data.is_type() && eltwise_supports_fusings(input_data.as()) && quantize_node.get_scale_shift_opt(); 
should_fuse |= input_data.is_type() && quantize_node.get_scale_shift_opt(); @@ -610,7 +620,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) { (parents[i]->is_type() && gemm_supports_fusings(parents[i]->as())) || (parents[i]->is_type()) || (parents[i]->is_type()) || - (parents[i]->is_type()) || + (parents[i]->is_type() && eltwise_supports_fusings(parents[i]->as())) || (parents[i]->is_type()) || (parents[i]->is_type() && dts_supports_fusings(parents[i]->as())) || (parents[i]->is_type() && reduce_supports_fusings(parents[i]->as())); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp index da8847f783c..e76f85d9989 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp @@ -6231,6 +6231,7 @@ struct eltwise_test_params { #define CASE_ELTWISE_FP16_1 {2, 16, 4, 4}, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx, eltwise_mode::sum #define CASE_ELTWISE_FP16_2 {2, 16, 4, 4}, data_types::f16, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx, eltwise_mode::sum #define CASE_ELTWISE_FP16_3 {2, 32, 4, 8}, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::b_fs_yx_fsv16, eltwise_mode::sum +#define CASE_ELTWISE_FP16_4 {3, 32, 4, 4}, data_types::f16, data_types::f16, format::fs_b_yx_fsv32, data_types::f16, format::fs_b_yx_fsv32, eltwise_mode::sum #define CASE_ELTWISE_I8_1 {2, 16, 4, 4}, data_types::i8, data_types::i8, format::bfyx, data_types::f32, format::bfyx, eltwise_mode::sum #define CASE_ELTWISE_I8_2 {2, 16, 4, 4}, data_types::i8, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx, eltwise_mode::sum #define CASE_ELTWISE_I8_3 {2, 16, 4, 4}, data_types::i8, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16, eltwise_mode::sum @@ 
-6386,6 +6387,49 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, eltwise_test_params{CASE_ELTWISE_FP32_3, 3, 5}, }), ); +class eltwise_fp32_fsv32 : public EltwiseFusingTest {}; +TEST_P(eltwise_fp32_fsv32, add) { + auto p = GetParam(); + create_topologies(input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + data("add_data", get_mem(get_per_channel_layout(p), -10, 10)), + eltwise("eltwise", {"input", "input2"}, p.mode, p.default_type), + eltwise("add", {"eltwise", "add_data"}, eltwise_mode::sum), + activation("activation", "add", activation_func::negative), + reorder("out", "activation", p.default_format, data_types::f32)); + + implementation_desc eltw_impl = { format::fs_b_yx_fsv32, "eltwise_fs_b_yx_fsv32" }; + bo_fused.set_option(build_option::force_implementations({ {"eltwise", eltw_impl} })); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(eltwise_fp32_fsv32, add_per_element) { + auto p = GetParam(); + create_topologies(input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + data("add_data", get_mem(get_input_layout(p), -10, 10)), + eltwise("eltwise", {"input", "input2"}, p.mode, p.default_type), + eltwise("add", {"eltwise", "add_data"}, eltwise_mode::sum), + activation("activation", "add", activation_func::negative), + reorder("out", "activation", p.default_format, data_types::f32)); + + implementation_desc eltw_impl = { format::fs_b_yx_fsv32, "eltwise_fs_b_yx_fsv32" }; + bo_fused.set_option(build_option::force_implementations({ {"eltwise", eltw_impl} })); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_CASE_P(fusings_gpu, + eltwise_fp32_fsv32, + ::testing::ValuesIn(std::vector{ + // There's no optimized eltwise kernel yet for fsv32 layout that supports fused_ops + // So only activation is fused via legacy mechanism + eltwise_test_params{CASE_ELTWISE_FP16_4, 4, 5}, + }), ); + class eltwise_fp32_fused_prims : public EltwiseFusingTest {}; TEST_P(eltwise_fp32_fused_prims, 
scale_activation) { auto p = GetParam();