[GPU] Fix reshape split for dynamic models + accuracy fix for SAM (#16911)

This commit is contained in:
Vladimir Paramuzov
2023-04-25 13:21:31 +04:00
committed by GitHub
parent 9247906879
commit f736c71feb
7 changed files with 182 additions and 89 deletions

View File

@@ -149,6 +149,11 @@ void handle_reshape::run(program& p) {
auto new_reshape = std::make_shared<reshape>("reorder:_reshape_split_" + user->id() + "_" + node->id(),
input_node.id(),
output_shape);
new_reshape->special_zero = prim->special_zero;
new_reshape->output_partial_shape = prim->output_partial_shape;
new_reshape->output_pattern = prim->output_pattern;
new_reshape->mode = prim->mode;
new_reshape->input = prim->input;
auto& new_reshape_node = p.get_or_create(new_reshape);
user->replace_dependency(0, input_node);
p.add_intermediate(new_reshape_node, *user, 0);

View File

@@ -23,16 +23,6 @@ namespace cldnn {
// Renders a boolean as the literal text ("true"/"false") used in debug dumps.
inline std::string bool_to_str(bool cond) {
    if (cond)
        return "true";
    return "false";
}
// Extracts the text between the first '<' and the first '>' of a type name,
// e.g. "struct cldnn::typed_program_node<cldnn::data>" -> "cldnn::data".
// Returns an empty string when the name has no well-formed "<...>" part.
inline std::string get_extr_type(const std::string& str) {
    const auto begin = str.find('<');
    const auto end = str.find('>');
    // Also reject a '>' that appears before '<': in that case the original
    // (end - begin) - 1 underflowed size_t and substr returned a bogus tail.
    if (begin == std::string::npos || end == std::string::npos || end < begin)
        return {};
    return str.substr(begin + 1, (end - begin) - 1);
}
// Converts a data_types enum value to its string name; thin wrapper over
// data_type_traits::name.
inline std::string dt_to_str(data_types dt) {
return data_type_traits::name(dt);
}

View File

@@ -846,7 +846,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
allocation_type type, bool reusable, bool reset = true) {
OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout without upper bound");
// Use layout with max tensor for dynamic shape with upper bound
auto static_layout = cldnn::layout(layout.data_type, layout.format, layout.get_tensor(), layout.data_padding);
auto static_layout = cldnn::layout(layout.get_partial_shape().get_max_shape(), layout.data_type, layout.format, layout.data_padding);
if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool))
return pool.get_memory(static_layout, id, net_id, dependencies, type, reusable, reset);
return pool.get_memory(static_layout, type, reset);

View File

@@ -6,6 +6,7 @@
#include "to_string_utils.h"
#include "data_inst.h"
#include "condition_inst.h"
#include "data_inst.h"
#include "json_object.h"
#include <algorithm>
@@ -170,40 +171,18 @@ void dump_graph_init(std::ofstream& graph,
const program& program,
std::function<bool(program_node const&)> const& filter) {
const std::string invalid_layout_msg = "(invalid layout)";
const auto extr_oformat = [&invalid_layout_msg](const program_node* ptr) {
if (!ptr->is_valid_output_layout())
return invalid_layout_msg;
auto output_layout = ptr->get_output_layout();
std::string out = output_layout.format.to_string();
return out;
};
const auto extr_odt = [&invalid_layout_msg](const program_node* ptr) {
if (!ptr->is_valid_output_layout())
return invalid_layout_msg;
auto output_layout = ptr->get_output_layout();
std::string out = dt_to_str(output_layout.data_type);
return out;
};
const auto dump_mem_info = [&invalid_layout_msg](const program_node* ptr) {
std::string out = "size_info: ";
std::string out = "layout_info: ";
if (!ptr->is_valid_output_layout()) {
return out + invalid_layout_msg;
}
auto out_layout = ptr->get_output_layout();
auto tensor_str = out_layout.to_string();
auto padding = out_layout.data_padding;
out += tensor_str;
if (!padding) {
out += " (nonpadded)";
if (!out_layout.data_padding) {
out += " " + out_layout.to_short_string();
} else {
out += "\nl: " + padding.lower_size().to_string() + "\nu: " + padding.upper_size().to_string();
out += " " + out_layout.to_string();
}
return out;
@@ -218,23 +197,20 @@ void dump_graph_init(std::ofstream& graph,
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpotentially-evaluated-expression"
#endif
auto& node_type = typeid(*node);
std::string node_type_name = get_extr_type(node_type.name());
graph << " " << get_node_id(node) << "[label=\"" << node->id() << ":\n"
<< node_type_name << "\n out format: " + extr_oformat(node)
<< "\n out data_type: " + extr_odt(node)
std::string node_type_name = node->get_primitive()->type_string();
graph << " " << get_node_id(node) << "[label=\"" << node->id() << ":"
<< "\\ntype: " << node_type_name
<< "\\nprocessing number: " << program.get_processing_order().get_processing_number(node)
<< "\\n color:" << (node->is_reusing_memory() ? std::to_string(node->get_reused_memory_color()) : "none")
<< (node->can_be_optimized() ? "\\n optimized out" : "");
if (node_type_name != "struct cldnn::data" && node_type_name != "struct cldnn::input_layout" &&
!node->can_be_optimized()) {
if (!node->is_type<data>()) {
graph << "\\n Selected kernel: "
<< (node->get_selected_impl() == nullptr ? "none"
: node->get_selected_impl()->get_kernel_name()) + " / "
<< node->get_preferred_impl_type()
<< "\n" + dump_mem_info(node);
<< node->get_preferred_impl_type();
}
graph << "\n" + dump_mem_info(node);
graph << "\"";
#ifdef __clang__
#pragma clang diagnostic pop

View File

@@ -30,7 +30,15 @@ JitConstants ReduceKernelBase::GetJitConstants(const reduce_params& params) cons
const auto& output = params.outputs[0];
if (output.is_dynamic()) {
size_t output_tensor_offset = 1 + GetFusedPrimitiveInputsCount(params);
size_t output_tensor_offset = params.inputs[0].is_dynamic() ? 1 : 0;
for (size_t i = 0; i < params.fused_ops.size(); i++) {
auto& fused_op_inputs = params.fused_ops[i].tensors;
for (auto& t : fused_op_inputs) {
if (t.is_dynamic())
output_tensor_offset++;
}
}
DimensionAccessHelper dims(output, output_tensor_offset);
jit.AddConstant(MakeJitConstant("COMPUTATIONAL_OPERATIONS_NUMBER", toVectorMulString({dims.x,
dims.y,

View File

@@ -18,8 +18,8 @@ using namespace ::tests;
namespace {
struct reduce_test_params {
cldnn::tensor in_shape;
cldnn::tensor out_shape;
ov::PartialShape in_shape;
ov::PartialShape out_shape;
cldnn::data_types data_type;
cldnn::format input_format;
data_types default_type;
@@ -34,9 +34,12 @@ struct reduce_test_params {
class ReduceFusingTest : public ::BaseFusingTest<reduce_test_params> {
public:
void execute(reduce_test_params& p) {
void execute(reduce_test_params& p, bool is_dynamic = false) {
auto input_prim = get_mem(get_input_layout(p));
cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused);
network network_fused(this->engine, this->topology_fused, cfg_fused);
@@ -52,70 +55,60 @@ public:
if (axis >= static_cast<int64_t>(rank))
throw std::runtime_error("Unsupported reduce test case");
switch (axis) {
case 0: // batch
p.out_shape.batch[0] = 1;
break;
case 1: // feature
p.out_shape.feature[0] = 1;
break;
case 2:
p.out_shape.spatial[rank - 3] = 1;
break;
case 3:
p.out_shape.spatial[rank - 4] = 1;
break;
case 4:
p.out_shape.spatial[rank - 5] = 1;
break;
case 5:
p.out_shape.spatial[rank - 6] = 1;
break;
}
p.out_shape[axis] = 1;
}
}
layout get_dynamic_input_layout(reduce_test_params& p) {
return layout{ ov::PartialShape::dynamic(p.in_shape.size()), p.data_type, p.input_format };
}
layout get_input_layout(reduce_test_params& p) {
return layout{ p.data_type, p.input_format, p.in_shape };
return layout{ p.in_shape, p.data_type, p.input_format };
}
layout get_output_layout(reduce_test_params& p) {
return layout{ p.out_shape, p.data_type, p.input_format };
}
layout get_per_channel_layout(reduce_test_params& p) {
return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } };
return layout{ {1, p.in_shape[1], 1, 1}, p.default_type, p.default_format };
}
};
} // namespace
/* ----------------------------------------------------------------------------------------------------- */
/* ---------------------------------------- Reduce cases ----------------------------------------------- */
/* ----------------------------------------------------------------------------------------------------- */
#define CASE_REDUCE_F32_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_1 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::f32, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_3 { 16, 16, 16, 8, 8, 8 }, { 16, 16, 16, 8, 8, 8 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_0 { 3, 7, 7, 5 }, { 3, 7, 7, 5 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_1 { 3, 7, 7, 5 }, { 3, 7, 7, 5 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_2 { 2, 4, 4, 4, 8 }, { 2, 4, 4, 4, 8 }, data_types::f32, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_3 { 16, 16, 8, 8, 8, 16 }, { 16, 16, 8, 8, 8, 16 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_0 { 3, 7, 7, 5 }, { 3, 7, 7, 5 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::f16, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::f16, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_2 { 2, 4, 4, 4, 8 }, { 2, 4, 4, 4, 8 }, data_types::f16, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_3 { 3, 5, 7, 7, 5, 3 }, { 3, 5, 7, 7, 5, 3 }, data_types::f16, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::i32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_0 { 3, 7, 7, 5 }, { 3, 7, 7, 5 }, data_types::i32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::i32, format::bfyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::i32, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_4 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_2 { 2, 4, 4, 4, 8 }, { 2, 4, 4, 4, 8 }, data_types::i32, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_3 { 3, 5, 7, 7, 5, 3 }, { 3, 5, 7, 7, 5, 3 }, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_4 { 3, 5, 7, 7, 5, 3 }, { 3, 5, 7, 7, 5, 3 }, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_0 { 3, 7, 7, 5 }, { 3, 7, 7, 5 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::i8, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::i8, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_2 { 2, 4, 4, 4, 8 }, { 2, 4, 4, 4, 8 }, data_types::i8, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_3 { 3, 5, 7, 7, 5, 3 }, { 3, 5, 7, 7, 5, 3 }, data_types::i8, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 },data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_0 { 3, 7, 7, 5 }, { 3, 7, 7, 5 },data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::u8, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::u8, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_2 { 2, 4, 4, 4, 8 }, { 2, 4, 4, 4, 8 }, data_types::u8, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_3 { 3, 5, 7, 7, 5, 3 }, { 3, 5, 7, 7, 5, 3 }, data_types::u8, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
class reduce_eltwise_activation_quantize : public ReduceFusingTest {};
@@ -272,6 +265,24 @@ TEST_P(reduce_scale_activation, per_channel) {
execute(p);
}
// Dynamic-shape variant of the reduce+scale+activation fusing test:
// the input keeps its rank but all dims are dynamic (get_dynamic_input_layout),
// and new shape inference is enabled via execute(p, true).
TEST_P(reduce_scale_activation, dynamic) {
auto p = GetParam();
create_topologies(
input_layout("input", get_dynamic_input_layout(p)),
data("scale_data", get_mem(get_per_channel_layout(p), -0.125f)),
reduce("reduce", input_info("input"), p.reduce_mode, p.reduce_axes, p.keep_dims),
eltwise("scale", { input_info("reduce"), input_info("scale_data") }, eltwise_mode::prod),
activation("activation", input_info("scale"), activation_func::cos),
reorder("output_reorder", input_info("activation"), p.default_format, data_types::f32)
);
// Activation won't be fused because onednn doesn't support cos activation
if (engine.get_device_info().supports_immad)
p.expected_fused_primitives++;
tolerance = 1e-02f;
execute(p, true);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_scale_activation, ::testing::ValuesIn(std::vector<reduce_test_params>{
reduce_test_params{ CASE_REDUCE_F32_0, 2, 4, reduce_mode::max, { 3, 2, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
reduce_test_params{ CASE_REDUCE_F32_1, 2, 4, reduce_mode::sum, { 3, 2, 0 }, true, "reduce_ref" },

View File

@@ -11,6 +11,8 @@
#include "data_inst.h"
#include "eltwise_inst.h"
#include "reshape_inst.h"
#include "reorder_inst.h"
#include "broadcast_inst.h"
#include "pass_manager.h"
#include "to_string_utils.h"
@@ -90,3 +92,104 @@ TEST(handle_reshape, skip_reorder_node_to_split_when_onndnn_not_support) {
ASSERT_TRUE(prog->get_node("matmul").get_dependency(0).get_output_layout().data_type == data_types::f16);
}
// Checks that when handle_reshape splits a reshape that has multiple users,
// the newly created reshape node carries the original primitive's parameters,
// so the output shape is correct on every branch after the split.
TEST(handle_reshape, correct_parameters_propagation) {
auto& engine = get_test_engine();
// Scalar constant added before the reshape and a {1, 12} constant added after it.
auto data0_layout = engine.allocate_memory({ ov::PartialShape{}, data_types::f16, format::bfyx });
auto data1_layout = engine.allocate_memory({ ov::PartialShape{1, 12}, data_types::f16, format::bfyx });
auto in_layout = layout{ ov::PartialShape{1, 2, 3, 4}, data_types::f16, format::bfyx };
topology topology;
topology.add(input_layout("input", in_layout));
topology.add(data("data0", data0_layout));
topology.add(data("data1", data1_layout));
topology.add(eltwise("e1", input_info("input"), input_info("data0"), eltwise_mode::sum));
// The reshape has two users ("e2" and "reorder") — the multi-user case the pass handles.
topology.add(reshape("reshape", input_info("e1"), false, {2, 12}, {2, 12}));
topology.add(eltwise("e2", input_info("reshape"), input_info("data1"), eltwise_mode::sum));
topology.add(reorder("reorder", input_info("reshape"), format::bfyx, data_types::f32));
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::optimize_data(true));
auto prog = program::build_program(engine, topology, config, false, true);
layout_optimizer lo(true);
program_wrapper::apply_opt_pass<handle_reshape>(*prog);
ASSERT_NE(prog, nullptr);
ASSERT_TRUE(has_node_with_type<reshape>(*prog));
ASSERT_TRUE(prog->get_node("reshape").can_be_optimized());
auto out_shape0 = prog->get_node("e2").get_output_layout().get_partial_shape();
auto out_shape1 = prog->get_node("reorder").get_output_layout().get_partial_shape();
ov::PartialShape expected_out_shape{2, 12};
// handle_reshape may do reshape split, so ensure that output shape on all branches is correct
ASSERT_EQ(out_shape0, expected_out_shape);
ASSERT_EQ(out_shape1, expected_out_shape);
}
// Verifies a reshape whose input rank (5D, bfzyx) differs from its output rank
// (4D, bfyx) when a reorder ends up on the reshape's input: the reshape must
// still be optimized out and the network must produce correct results.
TEST(handle_reshape, reshape_input_reorder) {
auto& engine = get_test_engine();
auto shape_memory = engine.allocate_memory({ ov::PartialShape{5}, data_types::i32, format::bfyx });
// Dynamic 5D input; at runtime it is broadcast to the concrete {1, 2, 16, 64, 64}.
auto in0_layout = layout{ ov::PartialShape{1, -1, 16, 64, 64}, data_types::f16, format::bfzyx };
auto in0_memory = engine.allocate_memory(layout{ ov::PartialShape{1, 2, 16, 64, 64}, data_types::f16, format::bfzyx });
auto in1_layout = layout{ ov::PartialShape{-1, 16, 64, 64}, data_types::f16, format::bfyx };
auto in1_memory = engine.allocate_memory({ ov::PartialShape{2, 16, 64, 64}, data_types::f16, format::bfyx });
auto in0 = generate_random_1d<FLOAT16>(in0_memory->count(), -10, 10);
auto in1 = generate_random_1d<FLOAT16>(in1_memory->count(), -10, 10);
set_values<FLOAT16>(in0_memory, in0);
// Broadcast target shape equals in0's concrete shape, so broadcast is a no-op on the data.
set_values<int32_t>(shape_memory, {1, 2, 16, 64, 64});
set_values<FLOAT16>(in1_memory, in1);
topology topology;
topology.add(input_layout("input0", in0_layout));
topology.add(input_layout("target_shape", shape_memory->get_layout()));
topology.add(broadcast("broadcast", input_info("input0"), input_info("target_shape"), {}, ov::op::BroadcastType::BIDIRECTIONAL));
// special_zero=true reshape from 5D down to 4D {-1, 16, 64, 64}.
topology.add(reshape("reshape", input_info("broadcast"), true, {-1, 16, 64, 64}, {-1, 16, 64, 64}));
topology.add(input_layout("input1", in1_layout));
topology.add(eltwise("eltw", input_info("reshape"), input_info("input1"), eltwise_mode::sum));
topology.add(reorder("reorder", input_info("eltw"), format::bfyx, data_types::f32));
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::optimize_data(true));
auto prog = program::build_program(engine, topology, config);
ASSERT_NE(prog, nullptr);
ASSERT_TRUE(has_node_with_type<reshape>(*prog));
ASSERT_TRUE(prog->get_node("reshape").can_be_optimized());
auto reshape_layout_in = prog->get_node("reshape").get_input_layouts()[0];
auto reshape_layout_out = prog->get_node("reshape").get_output_layout();
// At this moment transformations insert reorder before reshape which
// converts tensor to default format with rank = reshape_out_rank
// Likely in the future we'll update that reorder so it will use reshape_input_rank
// After that expected in format will be bfzyx
ASSERT_EQ(reshape_layout_in.format, format::bfyx);
ASSERT_EQ(reshape_layout_out.format, format::bfyx);
ov::PartialShape expected_out_shape{-1, 16, 64, 64};
ASSERT_EQ(reshape_layout_out.get_partial_shape(), expected_out_shape);
network net(prog);
net.set_input_data("input0", in0_memory);
net.set_input_data("input1", in1_memory);
net.set_input_data("target_shape", shape_memory);
auto output = net.execute();
auto out_mem = output.at("reorder").get_memory();
mem_lock<float> lock(out_mem, get_test_stream());
// Reference result: plain elementwise sum of the two random inputs.
for (size_t i = 0; i < out_mem->count(); i++) {
float expected = static_cast<float>(in0[i]) + static_cast<float>(in1[i]);
float actual = lock[i];
ASSERT_EQ(expected, actual) << " i = " << i;
}
}