[GPU] Support FC+eltwise fusion in fp16 for OneDNN (#16303)

* [GPU] Support FC+eltwise fusion in fp16

Signed-off-by: hyunback <hyunback.kim@intel.com>
This commit is contained in:
hyunback kim 2023-03-28 14:49:49 +09:00 committed by GitHub
parent 5dff012233
commit d06a22f4e4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 138 additions and 7 deletions

View File

@ -558,10 +558,14 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
return false;
};
auto fc_supports_fusings = [](fully_connected_node& node) -> bool {
// Predicate used by the fusing pass to decide whether a fully_connected node may take fused primitives.
// Captures by reference so it can query the layout optimizer (_lo).
auto fc_supports_fusings = [&](fully_connected_node& node) -> bool {
// oneDNN path: fusing is allowed whenever oneDNN impls are enabled and this node's preferred
// implementation is oneDNN; the input data type is not checked on this path.
if (_lo.get_optimization_attributes().use_onednn_impls &&
_lo.get_preferred_impl_type(node, format::any /*dummy*/) == impl_types::onednn) {
return true;
} else {
// Otherwise fusing is allowed only when the first dependency produces i8/u8 data.
auto in_dt = node.get_dependency(0).get_output_layout().data_type;
return data_type_traits::is_i8_u8(in_dt);
}
};
auto gemm_supports_fusings = [](gemm_node& node) -> bool {

View File

@ -1008,6 +1008,29 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
const auto prim_id = "broadcast:" + data.id() + "_broadcasted" + std::to_string(idx++);
auto broadcast_prim = std::make_shared<cldnn::broadcast>(prim_id, cldnn::input_info(data.id()), gemm_layout.get_shape(), ov::AxisSet{});
auto& broadcast_node = p.get_or_create(broadcast_prim);
p.add_intermediate(broadcast_node, *node, fused_prim.dep_start_idx, true);
broadcast_node.recalc_output_layouts(false);
}
}
} else if (node->is_type<fully_connected>() && node->get_preferred_impl_type() == impl_types::onednn) {
for (const auto& fused_prim : node->get_fused_primitives()) {
if (fused_prim.is_type<eltwise>() &&
one_of(fused_prim.typed_desc<eltwise>()->mode, {eltwise_mode::sum, eltwise_mode::sub, eltwise_mode::prod})) {
auto fc_layout = node->get_output_layout();
auto& data = node->get_dependency(fused_prim.dep_start_idx);
auto data_layout = data.get_output_layout();
if ((fc_layout.batch() == 1 || fc_layout.feature() == 1) ||
(data_layout.batch() == 1 && data_layout.feature() == 1) ||
(fc_layout.count() == data_layout.count())) {
continue;
}
static size_t idx = 0;
const auto prim_id = "broadcast:" + data.id() + "_broadcasted" + std::to_string(idx++);
auto broadcast_prim = std::make_shared<cldnn::broadcast>(prim_id, cldnn::input_info(data.id()), fc_layout.get_shape(), ov::AxisSet{});
auto& broadcast_node = p.get_or_create(broadcast_prim);
p.add_intermediate(broadcast_node, *node, fused_prim.dep_start_idx, true);
broadcast_node.recalc_output_layouts(false);

View File

@ -30,6 +30,7 @@ struct fully_connected_test_params {
format default_format;
size_t expected_fused_primitives;
size_t expected_not_fused_primitives;
std::string ocl_kernel_name; // for onednn test
};
class FullyConnectedFusingTest : public ::BaseFusingTest<fully_connected_test_params> {
@ -85,14 +86,23 @@ public:
auto input_prim = p.data_type == data_types::u8 ? get_mem(get_input_layout(p), 0, 10) : get_mem(get_input_layout(p));
auto impl_forcing = cfg_fused.get_property(ov::intel_gpu::force_implementations);
auto forcing_format = p.input_format;
for (auto& forcing : impl_forcing)
if (forcing.first == "fc_prim")
forcing_format = forcing.second.output_format;
ov::intel_gpu::ImplementationDesc conv_impl = { forcing_format, "", impl_types::onednn };
cfg_fused.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "fc_prim", conv_impl } }));
ov::intel_gpu::ImplementationDesc fc_impl = { forcing_format, "", impl_types::onednn };
cfg_fused.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "fc_prim", fc_impl } }));
if (!p.ocl_kernel_name.empty()) {
auto ocl_impl_forcing = cfg_not_fused.get_property(ov::intel_gpu::force_implementations);
auto ocl_forcing_format = p.input_format;
for (auto& forcing : ocl_impl_forcing)
if (forcing.first == "fc_prim")
ocl_forcing_format = forcing.second.output_format;
ov::intel_gpu::ImplementationDesc fc_ocl_impl = { ocl_forcing_format, p.ocl_kernel_name /*fully_connected_gpu_bfyx_ref*/};
cfg_not_fused.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "fc_prim", fc_ocl_impl } }));
}
network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused);
network network_fused(this->engine, this->topology_fused, cfg_fused);
network_fused.set_input_data("input", input_prim);
@ -154,6 +164,16 @@ public:
#define CASE_FC_U8S8_3D_3 { 2, 3, 1 }, { 2, 3, 15 }, { 15, 1, 1 }, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_U8S8_3D_4 { 1, 512, 1024 }, { 1, 384, 1024 }, { 1024, 1024, 1 }, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
// fp16 fully_connected test cases. Each macro expands to nine comma-separated initializers:
// three brace-enclosed dimension lists followed by three (data type, format) pairs.
// Every case here uses f16/bfyx, f16/oiyx and f32/bfyx for those pairs.
// NOTE(review): the dimension lists look like input shape, output shape and weights shape
// (e.g. { 1, 3 } with weights { 4, 3 } giving { 1, 4 }) — confirm against fully_connected_test_params.
#define CASE_FC_FP16_1 { 1, 3 }, { 1, 4 }, { 4, 3 }, data_types::f16, format::bfyx, data_types::f16, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_FP16_2 { 2, 3 }, { 2, 4 }, { 4, 3 }, data_types::f16, format::bfyx, data_types::f16, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_FP16_3 { 2, 32 }, { 2, 16 }, { 16, 32 }, data_types::f16, format::bfyx, data_types::f16, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_FP16_4 { 128, 76 }, { 128, 768 }, { 768, 76 }, data_types::f16, format::bfyx, data_types::f16, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_FP16_5 { 1, 128, 76 }, { 1, 128, 768 }, { 1, 768, 76 }, data_types::f16, format::bfyx, data_types::f16, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_FP16_6 { 2, 1, 76 }, { 2, 1, 768 }, { 768, 76, 1 }, data_types::f16, format::bfyx, data_types::f16, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_FP16_7 { 2, 128, 76 }, { 2, 128, 768 }, { 768, 76, 1 }, data_types::f16, format::bfyx, data_types::f16, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_FP16_3D_1 { 2, 32, 3 }, { 2, 32, 16 }, { 16, 3, 1 }, data_types::f16, format::bfyx, data_types::f16, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_FP16_3D_2 { 1, 1, 3 }, { 1, 1, 32 }, { 32, 3, 1 }, data_types::f16, format::bfyx, data_types::f16, format::oiyx, data_types::f32, format::bfyx
/* ----------------------------------------------------------------------------------------------------- */
/* ---------------------------------------- FC cases --------------------------------------------------- */
/* ----------------------------------------------------------------------------------------------------- */
@ -429,4 +449,88 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_inputs_fused_fp32_sum, ::testing::
// fully_connected_test_params{ CASE_FC_U8S8_3D_2, 2, 4 },
fully_connected_test_params{ CASE_FC_U8S8_3D_4, 2, 4 },
}));
// Fixture for the fp16 fully_connected + eltwise(sum) fusing suite. It adds nothing on top of
// FullyConnectedFusingTestOneDNN; it only gives the suite its own name.
class fc_fp16_eltwise_add : public FullyConnectedFusingTestOneDNN {};

// Topology under test:
//   input -> fully_connected("fc_prim", weights, bias) -> eltwise sum with "eltwise_data" -> reorder to f32.
// The parameter set is then handed to execute() with a tolerance of 1e-2.
TEST_P(fc_fp16_eltwise_add, basic) {
    // Kept as a mutable copy: the layout helpers and execute() receive it by name below.
    auto params = GetParam();

    create_topologies(
        input_layout("input", get_input_layout(params)),
        data("weights", get_mem(get_weights_layout(params))),
        data("bias", get_mem(get_bias_layout(params))),
        data("eltwise_data", get_mem(get_per_channel_layout(params), 1, 9)),
        fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(params)),
        eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sum),
        reorder("reorder_bfyx", input_info("eltwise"), params.default_format, data_types::f32)
    );

    tolerance = 1e-2f;
    execute(params);
}
// Parameter sets for fc_fp16_eltwise_add. Each entry is a CASE_FC_FP16_* macro followed by 2, 3 and a kernel name.
// NOTE(review): going by the trailing members of fully_connected_test_params, these presumably fill
// expected_fused_primitives, expected_not_fused_primitives and ocl_kernel_name — confirm against the struct.
INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_add, ::testing::ValuesIn(std::vector<fully_connected_test_params>{
// fully_connected_test_params{ CASE_FC_FP16_1, 2, 3, "fully_connected_gpu_bs_f_bsv16_b1"}, // TODO check a failure in fully_connected_gpu_bs_f_bsv16_b1 + eltwise in iGPU
// fully_connected_test_params{ CASE_FC_FP16_3D_3, 2, 3, "fully_connected_gpu_bfyx_ref"}, // TODO check onednn failure
// NOTE(review): CASE_FC_FP16_3D_3 is not among the CASE_FC_FP16_* macros visible in this change — make sure it is defined before re-enabling the entry above.
fully_connected_test_params{ CASE_FC_FP16_1, 2, 3, "fully_connected_gpu_bfyx_ref" },
fully_connected_test_params{ CASE_FC_FP16_2, 2, 3, "fully_connected_gpu_bfyx_ref" },
fully_connected_test_params{ CASE_FC_FP16_3, 2, 3, "fully_connected_gpu_bfyx_ref" },
fully_connected_test_params{ CASE_FC_FP16_4, 2, 3, "fully_connected_gpu_bfyx_ref" },
fully_connected_test_params{ CASE_FC_FP16_5, 2, 3, "fully_connected_gpu_bfyx_ref" },
fully_connected_test_params{ CASE_FC_FP16_6, 2, 3, "fully_connected_gpu_bfyx_ref" },
fully_connected_test_params{ CASE_FC_FP16_7, 2, 3, "fully_connected_gpu_bfyx_ref" },
fully_connected_test_params{ CASE_FC_FP16_3D_1, 2, 3, "fully_connected_gpu_bfyx_ref" },
fully_connected_test_params{ CASE_FC_FP16_3D_2, 2, 3, "fully_connected_gpu_bfyx_ref" },
}));
// Fixture for the fp16 fully_connected + eltwise(sub) fusing suite. It adds nothing on top of
// FullyConnectedFusingTestOneDNN; it only gives the suite its own name.
class fc_fp16_eltwise_sub : public FullyConnectedFusingTestOneDNN {};

// Topology under test:
//   input -> fully_connected("fc_prim", weights, bias) -> eltwise sub with "eltwise_data" -> reorder to f32.
// The parameter set is then handed to execute() with a tolerance of 1e-1.
TEST_P(fc_fp16_eltwise_sub, basic) {
    // Kept as a mutable copy: the layout helpers and execute() receive it by name below.
    auto params = GetParam();

    create_topologies(
        input_layout("input", get_input_layout(params)),
        data("weights", get_mem(get_weights_layout(params))),
        data("bias", get_mem(get_bias_layout(params))),
        data("eltwise_data", get_mem(get_per_channel_layout(params), 1, 9)),
        fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(params)),
        eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sub),
        reorder("reorder_bfyx", input_info("eltwise"), params.default_format, data_types::f32)
    );

    tolerance = 1e-1f;
    execute(params);
}
// Parameter sets for fc_fp16_eltwise_sub. Each entry is a CASE_FC_FP16_* macro followed by 2, 3 and a kernel name.
// NOTE(review): going by the trailing members of fully_connected_test_params, these presumably fill
// expected_fused_primitives, expected_not_fused_primitives and ocl_kernel_name — confirm against the struct.
INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_sub, ::testing::ValuesIn(std::vector<fully_connected_test_params>{
fully_connected_test_params{ CASE_FC_FP16_1, 2, 3, "fully_connected_gpu_bfyx_ref" },
fully_connected_test_params{ CASE_FC_FP16_2, 2, 3, "fully_connected_gpu_bfyx_ref" },
fully_connected_test_params{ CASE_FC_FP16_3, 2, 3, "fully_connected_gpu_bfyx_ref" },
fully_connected_test_params{ CASE_FC_FP16_3D_1, 2, 3, "fully_connected_gpu_bfyx_ref" },
fully_connected_test_params{ CASE_FC_FP16_3D_2, 2, 3, "fully_connected_gpu_bfyx_ref" },
}));
// Fixture for the fp16 fully_connected + eltwise(prod) fusing suite. It adds nothing on top of
// FullyConnectedFusingTestOneDNN; it only gives the suite its own name.
class fc_fp16_eltwise_prod : public FullyConnectedFusingTestOneDNN {};

// Topology under test:
//   input -> fully_connected("fc_prim", weights, bias) -> eltwise prod with "eltwise_data" -> reorder to f32.
// The parameter set is then handed to execute() with a tolerance of 1e-1.
TEST_P(fc_fp16_eltwise_prod, basic) {
    // Kept as a mutable copy: the layout helpers and execute() receive it by name below.
    auto params = GetParam();

    create_topologies(
        input_layout("input", get_input_layout(params)),
        data("weights", get_mem(get_weights_layout(params))),
        data("bias", get_mem(get_bias_layout(params))),
        data("eltwise_data", get_mem(get_per_channel_layout(params), 1, 9)),
        fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(params)),
        eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::prod),
        reorder("reorder_bfyx", input_info("eltwise"), params.default_format, data_types::f32)
    );

    tolerance = 1e-1f;
    execute(params);
}
// Parameter sets for fc_fp16_eltwise_prod. Each entry is a CASE_FC_FP16_* macro followed by 2, 3 and a kernel name.
// NOTE(review): going by the trailing members of fully_connected_test_params, these presumably fill
// expected_fused_primitives, expected_not_fused_primitives and ocl_kernel_name — confirm against the struct.
INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_prod, ::testing::ValuesIn(std::vector<fully_connected_test_params>{
fully_connected_test_params{ CASE_FC_FP16_1, 2, 3, "fully_connected_gpu_bfyx_ref" },
fully_connected_test_params{ CASE_FC_FP16_2, 2, 3, "fully_connected_gpu_bfyx_ref" },
fully_connected_test_params{ CASE_FC_FP16_3, 2, 3, "fully_connected_gpu_bfyx_ref" },
fully_connected_test_params{ CASE_FC_FP16_3D_1, 2, 3, "fully_connected_gpu_bfyx_ref" },
fully_connected_test_params{ CASE_FC_FP16_3D_2, 2, 3, "fully_connected_gpu_bfyx_ref" },
}));
#endif