diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/basic_memory_dependencies.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/basic_memory_dependencies.cpp index d42f1c112df..5362f7ce54c 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/basic_memory_dependencies.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/basic_memory_dependencies.cpp @@ -35,8 +35,7 @@ void basic_memory_dependencies::run(program& p) { add_memory_dependency(it.first, node); } - if (node->get_preferred_impl_type() == impl_types::onednn - && (node->is_type() || node->is_type())) { + if (node->get_preferred_impl_type() == impl_types::onednn) { size_t eltw_dep = 0; for (auto& fused_op : node->get_fused_primitives()) { if (fused_op.is_type() && fused_op.deps.size() == 1) { diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 8ef109b5510..04c9783e6f6 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -466,6 +466,12 @@ network::network(cldnn::BinaryInputBuffer& ib, const ExecutionConfig& config, st prim_inst->set_output_memory(new_mem); } + for (auto p_inst : _exec_order) { + if (p_inst->can_be_optimized() && !p_inst->is_dynamic()) { + p_inst->update_output_memory(); + } + } + size_t num_variable_state_primitives; ib >> num_variable_state_primitives; for (size_t i = 0; i < num_variable_state_primitives; i++) { diff --git a/src/plugins/intel_gpu/tests/fusions/deconvolution_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/deconvolution_fusion_test.cpp index 0545636b15a..30fe5cf7dc5 100644 --- a/src/plugins/intel_gpu/tests/fusions/deconvolution_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/deconvolution_fusion_test.cpp @@ -58,26 +58,26 @@ struct deconv_eltw_test_params { class DeconvolutionFusingTest : public ::BaseFusingTest { public: - void execute(deconv_test_params& p) { - execute(p, get_mem(get_input_layout(p))); + void execute(deconv_test_params& p, bool is_caching_test = false) { + execute(p, get_mem(get_input_layout(p)), is_caching_test); } - void execute(deconv_test_params& p, cldnn::memory::ptr input_prim) { + void execute(deconv_test_params& p, cldnn::memory::ptr input_prim, bool is_caching_test = false) { if (engine.get_device_info().supports_immad) p.expected_fused_primitives = p.expected_fused_primitives_onednn; - network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused); - network network_fused(this->engine, this->topology_fused, cfg_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); + network::ptr network_not_fused = get_network(this->engine, this->topology_non_fused, cfg_not_fused, get_test_stream_ptr(cfg_not_fused), is_caching_test); + network::ptr network_fused = get_network(this->engine, this->topology_fused, cfg_fused, get_test_stream_ptr(cfg_fused), is_caching_test); + network_fused->set_input_data("input", input_prim); + network_not_fused->set_input_data("input", input_prim); - compare(network_not_fused, network_fused, p); + compare(*network_not_fused, *network_fused, p); auto find_conv = [](primitive_info& p) -> bool { if (p.original_id == "deconv") return true; return false; }; - auto pi_fused = network_fused.get_primitives_info(); + auto pi_fused = network_fused->get_primitives_info(); auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv); if (info_fused != pi_fused.end()) std::cout << "kernel: " << info_fused->kernel_id << std::endl; @@ -443,22 +443,32 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_scale, ::testing::ValuesIn(std::vec deconv_test_params{ CASE_DECONV_S8S8_3D_8, 2, 2, 3 }, })); -class deconv_actv_eltw_actv : public DeconvolutionFusingTest {}; +class deconv_actv_eltw_actv : public DeconvolutionFusingTest { +public: + void run_test(bool is_caching_test = false) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("eltw_data", get_mem(get_output_layout(p))), + deconvolution("deconv", input_info("input"), { "weights" }, p.groups, p.stride, p.pad), + activation("act1", input_info("deconv"), activation_func::relu), + eltwise("eltw", { input_info("act1"), input_info("eltw_data") }, eltwise_mode::sum), + activation("act2", input_info("eltw"), activation_func::relu), + reorder("out", input_info("act2"), p.default_format, data_types::f32) + ); + // Need much higher tolerance because of deconvolution -> convolution optimization + tolerance = 1.f; + execute(p, is_caching_test); + } +}; + TEST_P(deconv_actv_eltw_actv, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("eltw_data", get_mem(get_output_layout(p))), - deconvolution("deconv", input_info("input"), { "weights" }, p.groups, p.stride, p.pad), - activation("act1", input_info("deconv"), activation_func::relu), - eltwise("eltw", { input_info("act1"), input_info("eltw_data") }, eltwise_mode::sum), - activation("act2", input_info("eltw"), activation_func::relu), - reorder("out", input_info("act2"), p.default_format, data_types::f32) - ); - // Need much higher tolerance because of deconvolution -> convolution optimization - tolerance = 1.f; - execute(p); + run_test(); +} + +TEST_P(deconv_actv_eltw_actv, basic_cached) { + run_test(true); } INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_actv_eltw_actv, ::testing::ValuesIn(std::vector{ diff --git a/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp index 115f09c24b7..c943626e53a 100644 --- a/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp @@ -94,8 +94,8 @@ public: cfg_fused.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "fc_prim", fc_impl } })); cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic)); - network::ptr network_not_fused = get_network(this->engine, this->topology_non_fused, cfg_not_fused, get_test_stream_ptr(), is_caching_test); - network::ptr network_fused = get_network(this->engine, this->topology_fused, cfg_fused, get_test_stream_ptr(), is_caching_test); + network::ptr network_not_fused = get_network(this->engine, this->topology_non_fused, cfg_not_fused, get_test_stream_ptr(cfg_not_fused), is_caching_test); + network::ptr network_fused = get_network(this->engine, this->topology_fused, cfg_fused, get_test_stream_ptr(cfg_fused), is_caching_test); network_fused->set_input_data("input", input_prim); network_not_fused->set_input_data("input", input_prim); @@ -411,43 +411,34 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_eltwise_activation_quantize_i8, :: #ifdef ENABLE_ONEDNN_FOR_GPU // FC onednn sum case -class fc_int8_inputs_fused_fp32_sum : public FullyConnectedFusingTestOneDNN {}; +class fc_int8_inputs_fused_fp32_sum : public FullyConnectedFusingTestOneDNN { +public: + void run_test(bool is_caching_test = false) { + auto p = GetParam(); + auto shift_layout = layout{ ov::PartialShape{p.weights_shape[0]}, p.default_type, p.default_format }; + + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("shift_data", get_mem(shift_layout, 1)), + fully_connected("fc_prim", input_info("input"), "weights", "bias", cldnn::data_types::f32, padding(), get_output_dim_size(p), get_input_weights_rank(p)), + eltwise("shift", { input_info("fc_prim"), input_info("shift_data") }, eltwise_mode::sum, cldnn::data_types::f32), + crop("crop", input_info("shift"), get_output_layout(p).get_tensor(), { 0, 0, 0, 0 }), + reorder("reorder_bfyx", input_info("crop"), p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p, is_caching_test); + } +}; + TEST_P(fc_int8_inputs_fused_fp32_sum, basic) { - auto p = GetParam(); - auto shift_layout = layout{ ov::PartialShape{p.weights_shape[0]}, p.default_type, p.default_format }; - - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("shift_data", get_mem(shift_layout, 1)), - fully_connected("fc_prim", input_info("input"), "weights", "bias", cldnn::data_types::f32, padding(), get_output_dim_size(p), get_input_weights_rank(p)), - eltwise("shift", { input_info("fc_prim"), input_info("shift_data") }, eltwise_mode::sum, cldnn::data_types::f32), - crop("crop", input_info("shift"), get_output_layout(p).get_tensor(), { 0, 0, 0, 0 }), - reorder("reorder_bfyx", input_info("crop"), p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); + run_test(false); } TEST_P(fc_int8_inputs_fused_fp32_sum, basic_cached) { - auto p = GetParam(); - auto shift_layout = layout{ ov::PartialShape{p.weights_shape[0]}, p.default_type, p.default_format }; - - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("shift_data", get_mem(shift_layout, 1)), - fully_connected("fc_prim", input_info("input"), "weights", "bias", cldnn::data_types::f32, padding(), get_output_dim_size(p), get_input_weights_rank(p)), - eltwise("shift", { input_info("fc_prim"), input_info("shift_data") }, eltwise_mode::sum, cldnn::data_types::f32), - crop("crop", input_info("shift"), get_output_layout(p).get_tensor(), { 0, 0, 0, 0 }), - reorder("reorder_bfyx", input_info("crop"), p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p, true); + run_test(true); } INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_inputs_fused_fp32_sum, ::testing::ValuesIn(std::vector{ @@ -458,37 +449,31 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_inputs_fused_fp32_sum, ::testing:: })); -class fc_fp16_eltwise_add : public FullyConnectedFusingTestOneDNN {}; -TEST_P(fc_fp16_eltwise_add, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)), - fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)), - eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sum), - reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32) - ); +class fc_fp16_eltwise_add : public FullyConnectedFusingTestOneDNN { +public: + void run_test(bool is_caching_test = false) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)), + fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)), + eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sum), + reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32) + ); - tolerance = 1e-2f; - execute(p); + tolerance = 1e-2f; + execute(p, is_caching_test); + } +}; + +TEST_P(fc_fp16_eltwise_add, basic) { + run_test(false); } TEST_P(fc_fp16_eltwise_add, basic_cached) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)), - fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)), - eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sum), - reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32) - ); - - tolerance = 1e-2f; - execute(p, true); + run_test(true); } INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_add, ::testing::ValuesIn(std::vector{ @@ -527,37 +512,31 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_fusings_gpu, fc_fp16_eltwise_add_dynamic, ::te fully_connected_test_params{ CASE_FC_FP16_4, 2, 3 }, })); -class fc_fp16_eltwise_sub : public FullyConnectedFusingTestOneDNN {}; -TEST_P(fc_fp16_eltwise_sub, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)), - fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)), - eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sub), - reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32) - ); +class fc_fp16_eltwise_sub : public FullyConnectedFusingTestOneDNN { +public: + void run_test(bool is_caching_test = false) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)), + fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)), + eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sub), + reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32) + ); - tolerance = 1e-1f; - execute(p); + tolerance = 1e-1f; + execute(p, is_caching_test); + } +}; + +TEST_P(fc_fp16_eltwise_sub, basic) { + run_test(false); } TEST_P(fc_fp16_eltwise_sub, basic_cached) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)), - fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)), - eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sub), - reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32) - ); - - tolerance = 1e-1f; - execute(p, true); + run_test(true); } INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_sub, ::testing::ValuesIn(std::vector{ @@ -568,37 +547,31 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_sub, ::testing::ValuesIn(s fully_connected_test_params{ CASE_FC_FP16_3D_2, 2, 3 }, })); -class fc_fp16_eltwise_prod : public FullyConnectedFusingTestOneDNN {}; -TEST_P(fc_fp16_eltwise_prod, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)), - fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)), - eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::prod), - reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32) - ); +class fc_fp16_eltwise_prod : public FullyConnectedFusingTestOneDNN { +public: + void run_test(bool is_caching_test = false) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)), + fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)), + eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::prod), + reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32) + ); - tolerance = 1e-1f; - execute(p); + tolerance = 1e-1f; + execute(p, is_caching_test); + } +}; + +TEST_P(fc_fp16_eltwise_prod, basic) { + run_test(false); } TEST_P(fc_fp16_eltwise_prod, basic_cached) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)), - fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)), - eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::prod), - reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32) - ); - - tolerance = 1e-1f; - execute(p, true); + run_test(true); } INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_prod, ::testing::ValuesIn(std::vector{ @@ -609,5 +582,39 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_prod, ::testing::ValuesIn( fully_connected_test_params{ CASE_FC_FP16_3D_2, 2, 3 }, })); +class fc_fp16_eltwise_sum : public FullyConnectedFusingTestOneDNN { +public: + void run_test(bool is_caching_test = false) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("eltwise_data", get_mem(get_output_layout(p))), + fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)), + eltwise("sum", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sum), + reorder("reorder_bfyx", input_info("sum"), p.default_format, data_types::f32) + ); + + tolerance = 1e-1f; + execute(p, is_caching_test); + } +}; + +TEST_P(fc_fp16_eltwise_sum, basic) { + run_test(false); +} + +TEST_P(fc_fp16_eltwise_sum, basic_cached) { + run_test(true); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_sum, ::testing::ValuesIn(std::vector{ + fully_connected_test_params{ CASE_FC_FP16_1, 2, 3 }, + fully_connected_test_params{ CASE_FC_FP16_2, 2, 3 }, + fully_connected_test_params{ CASE_FC_FP16_3, 2, 3 }, + fully_connected_test_params{ CASE_FC_FP16_3D_1, 2, 3 }, + fully_connected_test_params{ CASE_FC_FP16_3D_2, 2, 3 }, +})); #endif