diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp index 65683f10c68..ab69ea235de 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp @@ -149,6 +149,11 @@ void handle_reshape::run(program& p) { auto new_reshape = std::make_shared("reorder:_reshape_split_" + user->id() + "_" + node->id(), input_node.id(), output_shape); + new_reshape->special_zero = prim->special_zero; + new_reshape->output_partial_shape = prim->output_partial_shape; + new_reshape->output_pattern = prim->output_pattern; + new_reshape->mode = prim->mode; + new_reshape->input = prim->input; auto& new_reshape_node = p.get_or_create(new_reshape); user->replace_dependency(0, input_node); p.add_intermediate(new_reshape_node, *user, 0); diff --git a/src/plugins/intel_gpu/src/graph/include/to_string_utils.h b/src/plugins/intel_gpu/src/graph/include/to_string_utils.h index 76cf9bc686f..fb135b06d86 100644 --- a/src/plugins/intel_gpu/src/graph/include/to_string_utils.h +++ b/src/plugins/intel_gpu/src/graph/include/to_string_utils.h @@ -23,16 +23,6 @@ namespace cldnn { inline std::string bool_to_str(bool cond) { return cond ? "true" : "false"; } -inline std::string get_extr_type(const std::string& str) { - auto begin = str.find('<'); - auto end = str.find('>'); - - if (begin == std::string::npos || end == std::string::npos) - return {}; - - return str.substr(begin + 1, (end - begin) - 1); -} - inline std::string dt_to_str(data_types dt) { return data_type_traits::name(dt); } diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index adf8491870a..93eaf28b4f6 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -846,7 +846,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool, allocation_type type, bool reusable, bool reset = true) { OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout without upper bound"); // Use layout with max tensor for dynamic shape with upper bound - auto static_layout = cldnn::layout(layout.data_type, layout.format, layout.get_tensor(), layout.data_padding); + auto static_layout = cldnn::layout(layout.get_partial_shape().get_max_shape(), layout.data_type, layout.format, layout.data_padding); if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool)) return pool.get_memory(static_layout, id, net_id, dependencies, type, reusable, reset); return pool.get_memory(static_layout, type, reset); diff --git a/src/plugins/intel_gpu/src/graph/program_dump_graph.cpp b/src/plugins/intel_gpu/src/graph/program_dump_graph.cpp index 4f3c958cfeb..ea02db87a66 100644 --- a/src/plugins/intel_gpu/src/graph/program_dump_graph.cpp +++ b/src/plugins/intel_gpu/src/graph/program_dump_graph.cpp @@ -6,6 +6,7 @@ #include "to_string_utils.h" #include "data_inst.h" #include "condition_inst.h" +#include "data_inst.h" #include "json_object.h" #include @@ -170,40 +171,18 @@ void dump_graph_init(std::ofstream& graph, const program& program, std::function const& filter) { const std::string invalid_layout_msg = "(invalid layout)"; - const auto extr_oformat = [&invalid_layout_msg](const program_node* ptr) { - if (!ptr->is_valid_output_layout()) - return invalid_layout_msg; - - auto output_layout = ptr->get_output_layout(); - std::string out = output_layout.format.to_string(); - - return out; - }; - - const auto extr_odt = [&invalid_layout_msg](const program_node* ptr) { - if (!ptr->is_valid_output_layout()) - return invalid_layout_msg; - - auto output_layout = ptr->get_output_layout(); - std::string out = dt_to_str(output_layout.data_type); - - return out; - }; const auto dump_mem_info = [&invalid_layout_msg](const program_node* ptr) { - std::string out = "size_info: "; + std::string out = "layout_info: "; if (!ptr->is_valid_output_layout()) { return out + invalid_layout_msg; } auto out_layout = ptr->get_output_layout(); - auto tensor_str = out_layout.to_string(); - auto padding = out_layout.data_padding; - out += tensor_str; - if (!padding) { - out += " (nonpadded)"; + if (!out_layout.data_padding) { + out += " " + out_layout.to_short_string(); } else { - out += "\nl: " + padding.lower_size().to_string() + "\nu: " + padding.upper_size().to_string(); + out += " " + out_layout.to_string(); } return out; @@ -218,23 +197,20 @@ void dump_graph_init(std::ofstream& graph, #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wpotentially-evaluated-expression" #endif - auto& node_type = typeid(*node); - std::string node_type_name = get_extr_type(node_type.name()); - graph << " " << get_node_id(node) << "[label=\"" << node->id() << ":\n" - << node_type_name << "\n out format: " + extr_oformat(node) - << "\n out data_type: " + extr_odt(node) + std::string node_type_name = node->get_primitive()->type_string(); + graph << " " << get_node_id(node) << "[label=\"" << node->id() << ":" + << "\\ntype: " << node_type_name << "\\nprocessing number: " << program.get_processing_order().get_processing_number(node) << "\\n color:" << (node->is_reusing_memory() ? std::to_string(node->get_reused_memory_color()) : "none") << (node->can_be_optimized() ? "\\n optimized out" : ""); - if (node_type_name != "struct cldnn::data" && node_type_name != "struct cldnn::input_layout" && - !node->can_be_optimized()) { + if (!node->is_type()) { graph << "\\n Selected kernel: " << (node->get_selected_impl() == nullptr ? "none" : node->get_selected_impl()->get_kernel_name()) + " / " - << node->get_preferred_impl_type() - << "\n" + dump_mem_info(node); + << node->get_preferred_impl_type(); } + graph << "\n" + dump_mem_info(node); graph << "\""; #ifdef __clang__ #pragma clang diagnostic pop diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/reduce/reduce_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/reduce/reduce_kernel_base.cpp index d503efaf540..edfe036d25f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/reduce/reduce_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/reduce/reduce_kernel_base.cpp @@ -30,7 +30,15 @@ JitConstants ReduceKernelBase::GetJitConstants(const reduce_params& params) cons const auto& output = params.outputs[0]; if (output.is_dynamic()) { - size_t output_tensor_offset = 1 + GetFusedPrimitiveInputsCount(params); + size_t output_tensor_offset = params.inputs[0].is_dynamic() ? 1 : 0; + for (size_t i = 0; i < params.fused_ops.size(); i++) { + auto& fused_op_inputs = params.fused_ops[i].tensors; + + for (auto& t : fused_op_inputs) { + if (t.is_dynamic()) + output_tensor_offset++; + } + } DimensionAccessHelper dims(output, output_tensor_offset); jit.AddConstant(MakeJitConstant("COMPUTATIONAL_OPERATIONS_NUMBER", toVectorMulString({dims.x, dims.y, diff --git a/src/plugins/intel_gpu/tests/fusions/reduce_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/reduce_fusion_test.cpp index 736392016e2..2a7d75c107c 100644 --- a/src/plugins/intel_gpu/tests/fusions/reduce_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/reduce_fusion_test.cpp @@ -18,8 +18,8 @@ using namespace ::tests; namespace { struct reduce_test_params { - cldnn::tensor in_shape; - cldnn::tensor out_shape; + ov::PartialShape in_shape; + ov::PartialShape out_shape; cldnn::data_types data_type; cldnn::format input_format; data_types default_type; @@ -34,9 +34,12 @@ struct reduce_test_params { class ReduceFusingTest : public ::BaseFusingTest { public: - void execute(reduce_test_params& p) { + void execute(reduce_test_params& p, bool is_dynamic = false) { auto input_prim = get_mem(get_input_layout(p)); + cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic)); + cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic)); + network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused); network network_fused(this->engine, this->topology_fused, cfg_fused); @@ -52,70 +55,60 @@ public: if (axis >= static_cast(rank)) throw std::runtime_error("Unsupported reduce test case"); - switch (axis) { - case 0: // batch - p.out_shape.batch[0] = 1; - break; - case 1: // feature - p.out_shape.feature[0] = 1; - break; - case 2: - p.out_shape.spatial[rank - 3] = 1; - break; - case 3: - p.out_shape.spatial[rank - 4] = 1; - break; - case 4: - p.out_shape.spatial[rank - 5] = 1; - break; - case 5: - p.out_shape.spatial[rank - 6] = 1; - break; - } + p.out_shape[axis] = 1; } } + layout get_dynamic_input_layout(reduce_test_params& p) { + return layout{ ov::PartialShape::dynamic(p.in_shape.size()), p.data_type, p.input_format }; + } + layout get_input_layout(reduce_test_params& p) { - return layout{ p.data_type, p.input_format, p.in_shape }; + return layout{ p.in_shape, p.data_type, p.input_format }; + } + + layout get_output_layout(reduce_test_params& p) { + return layout{ p.out_shape, p.data_type, p.input_format }; } layout get_per_channel_layout(reduce_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } }; + return layout{ {1, p.in_shape[1], 1, 1}, p.default_type, p.default_format }; } + }; } // namespace /* ----------------------------------------------------------------------------------------------------- */ /* ---------------------------------------- Reduce cases ----------------------------------------------- */ /* ----------------------------------------------------------------------------------------------------- */ -#define CASE_REDUCE_F32_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_REDUCE_F32_1 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_REDUCE_F32_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::f32, format::bfzyx, data_types::f32, format::bfyx -#define CASE_REDUCE_F32_3 { 16, 16, 16, 8, 8, 8 }, { 16, 16, 16, 8, 8, 8 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_F32_0 { 3, 7, 7, 5 }, { 3, 7, 7, 5 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_REDUCE_F32_1 { 3, 7, 7, 5 }, { 3, 7, 7, 5 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_REDUCE_F32_2 { 2, 4, 4, 4, 8 }, { 2, 4, 4, 4, 8 }, data_types::f32, format::bfzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_F32_3 { 16, 16, 8, 8, 8, 16 }, { 16, 16, 8, 8, 8, 16 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx #define CASE_REDUCE_F32_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_REDUCE_F16_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_REDUCE_F16_0 { 3, 7, 7, 5 }, { 3, 7, 7, 5 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx #define CASE_REDUCE_F16_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx -#define CASE_REDUCE_F16_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::f16, format::bfzyx, data_types::f32, format::bfyx -#define CASE_REDUCE_F16_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::f16, format::bfwzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_F16_2 { 2, 4, 4, 4, 8 }, { 2, 4, 4, 4, 8 }, data_types::f16, format::bfzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_F16_3 { 3, 5, 7, 7, 5, 3 }, { 3, 5, 7, 7, 5, 3 }, data_types::f16, format::bfwzyx, data_types::f32, format::bfyx #define CASE_REDUCE_F16_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_REDUCE_I32_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::i32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_REDUCE_I32_0 { 3, 7, 7, 5 }, { 3, 7, 7, 5 }, data_types::i32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx #define CASE_REDUCE_I32_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::i32, format::bfyx, data_types::f32, format::bfyx -#define CASE_REDUCE_I32_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::i32, format::bfzyx, data_types::f32, format::bfyx -#define CASE_REDUCE_I32_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx -#define CASE_REDUCE_I32_4 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_I32_2 { 2, 4, 4, 4, 8 }, { 2, 4, 4, 4, 8 }, data_types::i32, format::bfzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_I32_3 { 3, 5, 7, 7, 5, 3 }, { 3, 5, 7, 7, 5, 3 }, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_I32_4 { 3, 5, 7, 7, 5, 3 }, { 3, 5, 7, 7, 5, 3 }, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx -#define CASE_REDUCE_I8_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_REDUCE_I8_0 { 3, 7, 7, 5 }, { 3, 7, 7, 5 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx #define CASE_REDUCE_I8_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_REDUCE_I8_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::i8, format::bfzyx, data_types::f32, format::bfyx -#define CASE_REDUCE_I8_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::i8, format::bfwzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_I8_2 { 2, 4, 4, 4, 8 }, { 2, 4, 4, 4, 8 }, data_types::i8, format::bfzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_I8_3 { 3, 5, 7, 7, 5, 3 }, { 3, 5, 7, 7, 5, 3 }, data_types::i8, format::bfwzyx, data_types::f32, format::bfyx #define CASE_REDUCE_I8_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_REDUCE_U8_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 },data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_REDUCE_U8_0 { 3, 7, 7, 5 }, { 3, 7, 7, 5 },data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx #define CASE_REDUCE_U8_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx -#define CASE_REDUCE_U8_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::u8, format::bfzyx, data_types::f32, format::bfyx -#define CASE_REDUCE_U8_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::u8, format::bfwzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_U8_2 { 2, 4, 4, 4, 8 }, { 2, 4, 4, 4, 8 }, data_types::u8, format::bfzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_U8_3 { 3, 5, 7, 7, 5, 3 }, { 3, 5, 7, 7, 5, 3 }, data_types::u8, format::bfwzyx, data_types::f32, format::bfyx #define CASE_REDUCE_U8_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx class reduce_eltwise_activation_quantize : public ReduceFusingTest {}; @@ -272,6 +265,24 @@ TEST_P(reduce_scale_activation, per_channel) { execute(p); } +TEST_P(reduce_scale_activation, dynamic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_dynamic_input_layout(p)), + data("scale_data", get_mem(get_per_channel_layout(p), -0.125f)), + reduce("reduce", input_info("input"), p.reduce_mode, p.reduce_axes, p.keep_dims), + eltwise("scale", { input_info("reduce"), input_info("scale_data") }, eltwise_mode::prod), + activation("activation", input_info("scale"), activation_func::cos), + reorder("output_reorder", input_info("activation"), p.default_format, data_types::f32) + ); + // Activation won't be fused because onednn doesn't support cos activation + if (engine.get_device_info().supports_immad) + p.expected_fused_primitives++; + + tolerance = 1e-02f; + execute(p, true); +} + INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_scale_activation, ::testing::ValuesIn(std::vector{ reduce_test_params{ CASE_REDUCE_F32_0, 2, 4, reduce_mode::max, { 3, 2, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" }, reduce_test_params{ CASE_REDUCE_F32_1, 2, 4, reduce_mode::sum, { 3, 2, 0 }, true, "reduce_ref" }, diff --git a/src/plugins/intel_gpu/tests/passes/handle_reshape.cpp b/src/plugins/intel_gpu/tests/passes/handle_reshape.cpp index 2a56f713dee..60853848680 100644 --- a/src/plugins/intel_gpu/tests/passes/handle_reshape.cpp +++ b/src/plugins/intel_gpu/tests/passes/handle_reshape.cpp @@ -11,6 +11,8 @@ #include "data_inst.h" #include "eltwise_inst.h" #include "reshape_inst.h" +#include "reorder_inst.h" +#include "broadcast_inst.h" #include "pass_manager.h" #include "to_string_utils.h" @@ -90,3 +92,104 @@ TEST(handle_reshape, skip_reorder_node_to_split_when_onndnn_not_support) { ASSERT_TRUE(prog->get_node("matmul").get_dependency(0).get_output_layout().data_type == data_types::f16); } + +TEST(handle_reshape, correct_parameters_propagation) { + auto& engine = get_test_engine(); + auto data0_layout = engine.allocate_memory({ ov::PartialShape{}, data_types::f16, format::bfyx }); + auto data1_layout = engine.allocate_memory({ ov::PartialShape{1, 12}, data_types::f16, format::bfyx }); + auto in_layout = layout{ ov::PartialShape{1, 2, 3, 4}, data_types::f16, format::bfyx }; + + topology topology; + topology.add(input_layout("input", in_layout)); + topology.add(data("data0", data0_layout)); + topology.add(data("data1", data1_layout)); + topology.add(eltwise("e1", input_info("input"), input_info("data0"), eltwise_mode::sum)); + topology.add(reshape("reshape", input_info("e1"), false, {2, 12}, {2, 12})); + topology.add(eltwise("e2", input_info("reshape"), input_info("data1"), eltwise_mode::sum)); + topology.add(reorder("reorder", input_info("reshape"), format::bfyx, data_types::f32)); + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + auto prog = program::build_program(engine, topology, config, false, true); + + layout_optimizer lo(true); + + program_wrapper::apply_opt_pass(*prog); + + ASSERT_NE(prog, nullptr); + ASSERT_TRUE(has_node_with_type(*prog)); + + ASSERT_TRUE(prog->get_node("reshape").can_be_optimized()); + + auto out_shape0 = prog->get_node("e2").get_output_layout().get_partial_shape(); + auto out_shape1 = prog->get_node("reorder").get_output_layout().get_partial_shape(); + + ov::PartialShape expected_out_shape{2, 12}; + + // handle_reshape may do reshape split, so ensure that output shape on all branches is correct + ASSERT_EQ(out_shape0, expected_out_shape); + ASSERT_EQ(out_shape1, expected_out_shape); +} + +TEST(handle_reshape, reshape_input_reorder) { + auto& engine = get_test_engine(); + auto shape_memory = engine.allocate_memory({ ov::PartialShape{5}, data_types::i32, format::bfyx }); + auto in0_layout = layout{ ov::PartialShape{1, -1, 16, 64, 64}, data_types::f16, format::bfzyx }; + auto in0_memory = engine.allocate_memory(layout{ ov::PartialShape{1, 2, 16, 64, 64}, data_types::f16, format::bfzyx }); + auto in1_layout = layout{ ov::PartialShape{-1, 16, 64, 64}, data_types::f16, format::bfyx }; + auto in1_memory = engine.allocate_memory({ ov::PartialShape{2, 16, 64, 64}, data_types::f16, format::bfyx }); + + auto in0 = generate_random_1d(in0_memory->count(), -10, 10); + auto in1 = generate_random_1d(in1_memory->count(), -10, 10); + set_values(in0_memory, in0); + set_values(shape_memory, {1, 2, 16, 64, 64}); + set_values(in1_memory, in1); + + topology topology; + topology.add(input_layout("input0", in0_layout)); + topology.add(input_layout("target_shape", shape_memory->get_layout())); + topology.add(broadcast("broadcast", input_info("input0"), input_info("target_shape"), {}, ov::op::BroadcastType::BIDIRECTIONAL)); + topology.add(reshape("reshape", input_info("broadcast"), true, {-1, 16, 64, 64}, {-1, 16, 64, 64})); + topology.add(input_layout("input1", in1_layout)); + topology.add(eltwise("eltw", input_info("reshape"), input_info("input1"), eltwise_mode::sum)); + topology.add(reorder("reorder", input_info("eltw"), format::bfyx, data_types::f32)); + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + auto prog = program::build_program(engine, topology, config); + + ASSERT_NE(prog, nullptr); + ASSERT_TRUE(has_node_with_type(*prog)); + + ASSERT_TRUE(prog->get_node("reshape").can_be_optimized()); + auto reshape_layout_in = prog->get_node("reshape").get_input_layouts()[0]; + auto reshape_layout_out = prog->get_node("reshape").get_output_layout(); + + // At this moment transfomations insert reorder before reshape which + // converts tensor to default format with rank = reshape_out_rank + // Likely in the future we'll update that reorder so it will use reshape_input_rank + // After that expected in format will be bfzyx + ASSERT_EQ(reshape_layout_in.format, format::bfyx); + ASSERT_EQ(reshape_layout_out.format, format::bfyx); + + ov::PartialShape expected_out_shape{-1, 16, 64, 64}; + ASSERT_EQ(reshape_layout_out.get_partial_shape(), expected_out_shape); + + network net(prog); + + net.set_input_data("input0", in0_memory); + net.set_input_data("input1", in1_memory); + net.set_input_data("target_shape", shape_memory); + auto output = net.execute(); + + auto out_mem = output.at("reorder").get_memory(); + mem_lock lock(out_mem, get_test_stream()); + + for (size_t i = 0; i < out_mem->count(); i++) { + float expected = static_cast(in0[i]) + static_cast(in1[i]); + float actual = lock[i]; + ASSERT_EQ(expected, actual) << " i = " << i; + } +}