[GPU] Fixed OneDNN fc+sum fusion serialization (#16988)

* fixed onednn fc+sum fusion serialization

* removed the white list for sum post op fusion

* added deconv fusing caching tests
This commit is contained in:
Eddy Kim
2023-04-20 01:43:27 +09:00
committed by GitHub
parent 4c3a4a8992
commit fab8236af3
4 changed files with 164 additions and 142 deletions

View File

@@ -35,8 +35,7 @@ void basic_memory_dependencies::run(program& p) {
add_memory_dependency(it.first, node);
}
if (node->get_preferred_impl_type() == impl_types::onednn
&& (node->is_type<convolution>() || node->is_type<deconvolution>())) {
if (node->get_preferred_impl_type() == impl_types::onednn) {
size_t eltw_dep = 0;
for (auto& fused_op : node->get_fused_primitives()) {
if (fused_op.is_type<eltwise>() && fused_op.deps.size() == 1) {

View File

@@ -466,6 +466,12 @@ network::network(cldnn::BinaryInputBuffer& ib, const ExecutionConfig& config, st
prim_inst->set_output_memory(new_mem);
}
for (auto p_inst : _exec_order) {
if (p_inst->can_be_optimized() && !p_inst->is_dynamic()) {
p_inst->update_output_memory();
}
}
size_t num_variable_state_primitives;
ib >> num_variable_state_primitives;
for (size_t i = 0; i < num_variable_state_primitives; i++) {

View File

@@ -58,26 +58,26 @@ struct deconv_eltw_test_params {
class DeconvolutionFusingTest : public ::BaseFusingTest<deconv_test_params> {
public:
void execute(deconv_test_params& p) {
execute(p, get_mem(get_input_layout(p)));
void execute(deconv_test_params& p, bool is_caching_test = false) {
execute(p, get_mem(get_input_layout(p)), is_caching_test);
}
void execute(deconv_test_params& p, cldnn::memory::ptr input_prim) {
void execute(deconv_test_params& p, cldnn::memory::ptr input_prim, bool is_caching_test = false) {
if (engine.get_device_info().supports_immad)
p.expected_fused_primitives = p.expected_fused_primitives_onednn;
network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused);
network network_fused(this->engine, this->topology_fused, cfg_fused);
network_fused.set_input_data("input", input_prim);
network_not_fused.set_input_data("input", input_prim);
network::ptr network_not_fused = get_network(this->engine, this->topology_non_fused, cfg_not_fused, get_test_stream_ptr(cfg_not_fused), is_caching_test);
network::ptr network_fused = get_network(this->engine, this->topology_fused, cfg_fused, get_test_stream_ptr(cfg_fused), is_caching_test);
network_fused->set_input_data("input", input_prim);
network_not_fused->set_input_data("input", input_prim);
compare(network_not_fused, network_fused, p);
compare(*network_not_fused, *network_fused, p);
auto find_conv = [](primitive_info& p) -> bool {
if (p.original_id == "deconv")
return true;
return false;
};
auto pi_fused = network_fused.get_primitives_info();
auto pi_fused = network_fused->get_primitives_info();
auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv);
if (info_fused != pi_fused.end())
std::cout << "kernel: " << info_fused->kernel_id << std::endl;
@@ -443,22 +443,32 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_scale, ::testing::ValuesIn(std::vec
deconv_test_params{ CASE_DECONV_S8S8_3D_8, 2, 2, 3 },
}));
class deconv_actv_eltw_actv : public DeconvolutionFusingTest {};
// Fusion pattern under test: deconvolution -> ReLU -> eltwise(sum) -> ReLU.
class deconv_actv_eltw_actv : public DeconvolutionFusingTest {
public:
    // Builds the topology and compares fused vs. non-fused execution.
    // Pass is_caching_test = true to round-trip the network through the
    // serialization/caching path before execution.
    void run_test(bool is_caching_test = false) {
        auto params = GetParam();
        create_topologies(
            input_layout("input", get_input_layout(params)),
            data("weights", get_mem(get_weights_layout(params))),
            data("eltw_data", get_mem(get_output_layout(params))),
            deconvolution("deconv", input_info("input"), { "weights" }, params.groups, params.stride, params.pad),
            activation("act1", input_info("deconv"), activation_func::relu),
            eltwise("eltw", { input_info("act1"), input_info("eltw_data") }, eltwise_mode::sum),
            activation("act2", input_info("eltw"), activation_func::relu),
            reorder("out", input_info("act2"), params.default_format, data_types::f32)
        );
        // Much higher tolerance is required because of the
        // deconvolution -> convolution optimization.
        tolerance = 1.f;
        execute(params, is_caching_test);
    }
};
// Non-cached run of the deconv+activation+eltwise+activation fusion test.
// NOTE(review): this span looks like a rendered diff with old and new lines
// interleaved — both the pre-change inline body (ending in execute(p)) and the
// post-change run_test() call are present. Confirm against the original
// commit; the intended final body is likely just `run_test();`.
TEST_P(deconv_actv_eltw_actv, basic) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("eltw_data", get_mem(get_output_layout(p))),
deconvolution("deconv", input_info("input"), { "weights" }, p.groups, p.stride, p.pad),
activation("act1", input_info("deconv"), activation_func::relu),
eltwise("eltw", { input_info("act1"), input_info("eltw_data") }, eltwise_mode::sum),
activation("act2", input_info("eltw"), activation_func::relu),
reorder("out", input_info("act2"), p.default_format, data_types::f32)
);
// Need much higher tolerance because of deconvolution -> convolution optimization
tolerance = 1.f;
// NOTE(review): deleted line from the diff — superseded by run_test() below.
execute(p);
run_test();
}
// Same fusion pattern, but exercised through the serialization/caching path
// (run_test(true) loads the network back from the cache before executing).
TEST_P(deconv_actv_eltw_actv, basic_cached) {
run_test(true);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_actv_eltw_actv, ::testing::ValuesIn(std::vector<deconv_test_params>{

View File

@@ -94,8 +94,8 @@ public:
cfg_fused.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "fc_prim", fc_impl } }));
cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
network::ptr network_not_fused = get_network(this->engine, this->topology_non_fused, cfg_not_fused, get_test_stream_ptr(), is_caching_test);
network::ptr network_fused = get_network(this->engine, this->topology_fused, cfg_fused, get_test_stream_ptr(), is_caching_test);
network::ptr network_not_fused = get_network(this->engine, this->topology_non_fused, cfg_not_fused, get_test_stream_ptr(cfg_not_fused), is_caching_test);
network::ptr network_fused = get_network(this->engine, this->topology_fused, cfg_fused, get_test_stream_ptr(cfg_fused), is_caching_test);
network_fused->set_input_data("input", input_prim);
network_not_fused->set_input_data("input", input_prim);
@@ -411,43 +411,34 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_eltwise_activation_quantize_i8, ::
#ifdef ENABLE_ONEDNN_FOR_GPU
// FC onednn sum case
class fc_int8_inputs_fused_fp32_sum : public FullyConnectedFusingTestOneDNN {};
// oneDNN FC + fp32 sum post-op case: fully_connected -> eltwise(sum) -> crop.
class fc_int8_inputs_fused_fp32_sum : public FullyConnectedFusingTestOneDNN {
public:
    // Builds the topology and compares fused vs. non-fused execution.
    // is_caching_test = true round-trips the network through serialization,
    // covering the fc+sum fusion serialization path.
    void run_test(bool is_caching_test = false) {
        auto params = GetParam();
        auto shift_data_layout = layout{ ov::PartialShape{params.weights_shape[0]}, params.default_type, params.default_format };
        create_topologies(
            input_layout("input", get_input_layout(params)),
            data("weights", get_mem(get_weights_layout(params))),
            data("bias", get_mem(get_bias_layout(params))),
            data("shift_data", get_mem(shift_data_layout, 1)),
            fully_connected("fc_prim", input_info("input"), "weights", "bias", cldnn::data_types::f32, padding(), get_output_dim_size(params), get_input_weights_rank(params)),
            eltwise("shift", { input_info("fc_prim"), input_info("shift_data") }, eltwise_mode::sum, cldnn::data_types::f32),
            crop("crop", input_info("shift"), get_output_layout(params).get_tensor(), { 0, 0, 0, 0 }),
            reorder("reorder_bfyx", input_info("crop"), params.default_format, data_types::f32)
        );
        tolerance = 1.f;
        execute(params, is_caching_test);
    }
};
// Non-cached run of the FC + fp32 sum fusion test.
// NOTE(review): rendered-diff artifact — the old inline body and the new
// run_test(false) call are both present. The intended final body is likely
// just `run_test(false);`. Confirm against the original commit.
TEST_P(fc_int8_inputs_fused_fp32_sum, basic) {
auto p = GetParam();
auto shift_layout = layout{ ov::PartialShape{p.weights_shape[0]}, p.default_type, p.default_format };
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("shift_data", get_mem(shift_layout, 1)),
fully_connected("fc_prim", input_info("input"), "weights", "bias", cldnn::data_types::f32, padding(), get_output_dim_size(p), get_input_weights_rank(p)),
eltwise("shift", { input_info("fc_prim"), input_info("shift_data") }, eltwise_mode::sum, cldnn::data_types::f32),
crop("crop", input_info("shift"), get_output_layout(p).get_tensor(), { 0, 0, 0, 0 }),
reorder("reorder_bfyx", input_info("crop"), p.default_format, data_types::f32)
);
tolerance = 1.f;
// NOTE(review): deleted line from the diff — superseded by run_test(false).
execute(p);
run_test(false);
}
// Cached (serialization round-trip) run of the FC + fp32 sum fusion test.
// NOTE(review): rendered-diff artifact — the old inline body (ending in
// execute(p, true)) and the new run_test(true) call are both present. The
// intended final body is likely just `run_test(true);`.
TEST_P(fc_int8_inputs_fused_fp32_sum, basic_cached) {
auto p = GetParam();
auto shift_layout = layout{ ov::PartialShape{p.weights_shape[0]}, p.default_type, p.default_format };
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("shift_data", get_mem(shift_layout, 1)),
fully_connected("fc_prim", input_info("input"), "weights", "bias", cldnn::data_types::f32, padding(), get_output_dim_size(p), get_input_weights_rank(p)),
eltwise("shift", { input_info("fc_prim"), input_info("shift_data") }, eltwise_mode::sum, cldnn::data_types::f32),
crop("crop", input_info("shift"), get_output_layout(p).get_tensor(), { 0, 0, 0, 0 }),
reorder("reorder_bfyx", input_info("crop"), p.default_format, data_types::f32)
);
tolerance = 1.f;
// NOTE(review): deleted line from the diff — superseded by run_test(true).
execute(p, true);
run_test(true);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_inputs_fused_fp32_sum, ::testing::ValuesIn(std::vector<fully_connected_test_params>{
@@ -458,37 +449,31 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_inputs_fused_fp32_sum, ::testing::
}));
class fc_fp16_eltwise_add : public FullyConnectedFusingTestOneDNN {};
TEST_P(fc_fp16_eltwise_add, basic) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)),
fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)),
eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sum),
reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32)
);
// oneDNN FC + eltwise(sum with per-channel data) fusion fixture.
// run_test builds the topology and compares fused vs. non-fused execution;
// is_caching_test = true exercises the serialization/caching path.
class fc_fp16_eltwise_add : public FullyConnectedFusingTestOneDNN {
public:
void run_test(bool is_caching_test = false) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)),
fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)),
eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sum),
reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32)
);
// NOTE(review): the next two lines appear to be deleted diff lines that a
// rendered-diff capture merged in — the pair below (with is_caching_test)
// is the post-change version. Confirm against the original commit.
tolerance = 1e-2f;
execute(p);
tolerance = 1e-2f;
execute(p, is_caching_test);
}
};
// Non-cached variant: run_test's is_caching_test parameter defaults to false.
TEST_P(fc_fp16_eltwise_add, basic) {
    run_test();
}
// Cached (serialization round-trip) run of the FC + eltwise(add) fusion test.
// NOTE(review): rendered-diff artifact — the old inline body (ending in
// execute(p, true)) and the new run_test(true) call are both present. The
// intended final body is likely just `run_test(true);`.
TEST_P(fc_fp16_eltwise_add, basic_cached) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)),
fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)),
eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sum),
reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32)
);
tolerance = 1e-2f;
// NOTE(review): deleted line from the diff — superseded by run_test(true).
execute(p, true);
run_test(true);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_add, ::testing::ValuesIn(std::vector<fully_connected_test_params>{
@@ -527,37 +512,31 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_fusings_gpu, fc_fp16_eltwise_add_dynamic, ::te
fully_connected_test_params{ CASE_FC_FP16_4, 2, 3 },
}));
class fc_fp16_eltwise_sub : public FullyConnectedFusingTestOneDNN {};
TEST_P(fc_fp16_eltwise_sub, basic) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)),
fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)),
eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sub),
reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32)
);
// oneDNN FC + eltwise(sub with per-channel data) fusion fixture.
// run_test builds the topology and compares fused vs. non-fused execution;
// is_caching_test = true exercises the serialization/caching path.
class fc_fp16_eltwise_sub : public FullyConnectedFusingTestOneDNN {
public:
void run_test(bool is_caching_test = false) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)),
fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)),
eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sub),
reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32)
);
// NOTE(review): the next two lines appear to be deleted diff lines merged in
// by a rendered-diff capture — the pair below (with is_caching_test) is the
// post-change version. Confirm against the original commit.
tolerance = 1e-1f;
execute(p);
tolerance = 1e-1f;
execute(p, is_caching_test);
}
};
// Non-cached variant: run_test's is_caching_test parameter defaults to false.
TEST_P(fc_fp16_eltwise_sub, basic) {
    run_test();
}
// Cached (serialization round-trip) run of the FC + eltwise(sub) fusion test.
// NOTE(review): rendered-diff artifact — the old inline body (ending in
// execute(p, true)) and the new run_test(true) call are both present. The
// intended final body is likely just `run_test(true);`.
TEST_P(fc_fp16_eltwise_sub, basic_cached) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)),
fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)),
eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sub),
reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32)
);
tolerance = 1e-1f;
// NOTE(review): deleted line from the diff — superseded by run_test(true).
execute(p, true);
run_test(true);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_sub, ::testing::ValuesIn(std::vector<fully_connected_test_params>{
@@ -568,37 +547,31 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_sub, ::testing::ValuesIn(s
fully_connected_test_params{ CASE_FC_FP16_3D_2, 2, 3 },
}));
class fc_fp16_eltwise_prod : public FullyConnectedFusingTestOneDNN {};
TEST_P(fc_fp16_eltwise_prod, basic) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)),
fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)),
eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::prod),
reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32)
);
// oneDNN FC + eltwise(prod with per-channel data) fusion fixture.
// run_test builds the topology and compares fused vs. non-fused execution;
// is_caching_test = true exercises the serialization/caching path.
class fc_fp16_eltwise_prod : public FullyConnectedFusingTestOneDNN {
public:
void run_test(bool is_caching_test = false) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)),
fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)),
eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::prod),
reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32)
);
// NOTE(review): the next two lines appear to be deleted diff lines merged in
// by a rendered-diff capture — the pair below (with is_caching_test) is the
// post-change version. Confirm against the original commit.
tolerance = 1e-1f;
execute(p);
tolerance = 1e-1f;
execute(p, is_caching_test);
}
};
// Non-cached variant: run_test's is_caching_test parameter defaults to false.
TEST_P(fc_fp16_eltwise_prod, basic) {
    run_test();
}
// Cached (serialization round-trip) run of the FC + eltwise(prod) fusion test.
// NOTE(review): rendered-diff artifact — the old inline body (ending in
// execute(p, true)) and the new run_test(true) call are both present. The
// intended final body is likely just `run_test(true);`.
TEST_P(fc_fp16_eltwise_prod, basic_cached) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)),
fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)),
eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::prod),
reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32)
);
tolerance = 1e-1f;
// NOTE(review): deleted line from the diff — superseded by run_test(true).
execute(p, true);
run_test(true);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_prod, ::testing::ValuesIn(std::vector<fully_connected_test_params>{
@@ -609,5 +582,39 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_prod, ::testing::ValuesIn(
fully_connected_test_params{ CASE_FC_FP16_3D_2, 2, 3 },
}));
// oneDNN FC + sum post-op fixture: fully_connected -> eltwise(sum) where the
// second eltwise input has the FC output layout (the onednn sum post-op case).
class fc_fp16_eltwise_sum : public FullyConnectedFusingTestOneDNN {
public:
    // Builds the topology and compares fused vs. non-fused execution.
    // is_caching_test = true round-trips the network through serialization,
    // covering the fc+sum fusion serialization path.
    void run_test(bool is_caching_test = false) {
        auto params = GetParam();
        create_topologies(
            input_layout("input", get_input_layout(params)),
            data("weights", get_mem(get_weights_layout(params))),
            data("bias", get_mem(get_bias_layout(params))),
            data("eltwise_data", get_mem(get_output_layout(params))),
            fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(params)),
            eltwise("sum", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sum),
            reorder("reorder_bfyx", input_info("sum"), params.default_format, data_types::f32)
        );
        tolerance = 1e-1f;
        execute(params, is_caching_test);
    }
};
// Non-cached variant: run_test's is_caching_test parameter defaults to false.
TEST_P(fc_fp16_eltwise_sum, basic) {
    run_test();
}
// Same FC + sum fusion pattern, exercised through the serialization/caching
// path (run_test(true) reloads the network from the cache before executing).
TEST_P(fc_fp16_eltwise_sum, basic_cached) {
run_test(true);
}
// Parameter sets for fc_fp16_eltwise_sum: 2D and 3D FP16 FC cases; the last
// two numbers in each entry are the expected fused primitive counts
// (reference vs. fused graph).
INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_sum, ::testing::ValuesIn(std::vector<fully_connected_test_params>{
fully_connected_test_params{ CASE_FC_FP16_1, 2, 3 },
fully_connected_test_params{ CASE_FC_FP16_2, 2, 3 },
fully_connected_test_params{ CASE_FC_FP16_3, 2, 3 },
fully_connected_test_params{ CASE_FC_FP16_3D_1, 2, 3 },
fully_connected_test_params{ CASE_FC_FP16_3D_2, 2, 3 },
}));
#endif