From 05159b8cfb9e0535b3c88af92c17c176039e3467 Mon Sep 17 00:00:00 2001
From: Taylor Yeonbok Lee
Date: Sun, 18 Dec 2022 20:03:23 -0800
Subject: [PATCH] [GPU] Fixed shape infer dep for blocked format (#14646)

* Fixed shape infer dep for blocked format

* Set preferred format for shape_of to be made from input rank

- ShapeOf should get the original output layout of the previous node, which is not reordered.
---
 .../remove_redundant_reorders.cpp             |  3 +
 .../graph/graph_optimizer/reorder_inputs.cpp  |  1 -
 .../intel_gpu/src/graph/layout_optimizer.cpp  | 17 ++++
 .../tests/shape_infer/broadcast_si_test.cpp   | 82 +++++++++++++++++++
 4 files changed, 102 insertions(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
index dfc9d6175cb..97cc6b9d772 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
@@ -251,6 +251,9 @@ void remove_redundant_reorders::run(program& p) {
         if (!node->is_type<reorder>()) // only care for reorders
             continue;
 
+        if (node->is_dynamic())
+            continue;
+
         auto& r_node = node->as<reorder>();
 
         bool no_output_optimization = remove_output_reorders ?
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
index b9afb144b27..794abdfd072 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
@@ -452,7 +452,6 @@ void minimize_local_reorders(program& p, std::map<program_node*, format::type>&
     for (auto node : p.get_processing_order()) {
         if (!node->is_in_data_flow())
             continue;
-
         auto preferred_format = lo.get_preferred_format(*node);
 
         if (preferred_format != format::any) {
diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
index dca4dec2849..b031e7ade69 100644
--- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
+++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -12,6 +12,7 @@
 #include "resample_inst.h"
 #include "reshape_inst.h"
 #include "arg_max_min_inst.h"
+#include "shape_of_inst.h"
 #include "generic_layer.hpp"
 #include
 
@@ -1653,6 +1654,22 @@ format layout_optimizer::get_preferred_format(program_node& node) {
     auto output_layout = node.get_output_layout();
     bool use_onednn_impls = _optimization_attributes.use_onednn_impls;
 
+    bool allow_new_shape_infer = node.get_program().get_options().get<build_option_type::allow_new_shape_infer>()->enabled();
+
+    if (allow_new_shape_infer) {
+        if (node.is_type<shape_of>())
+            return format::get_default_format(node.get_dependency(0).get_output_layout(false).get_rank());
+        for (auto u : node.get_users()) {
+            for (auto dep_idx : u->get_shape_infer_dependencies()) {
+                if (u->get_dependencies().size() <= dep_idx)
+                    continue;
+                if (u->get_dependency(dep_idx).get_unique_id() == node.get_unique_id()) {
+                    expected = format::get_default_format(output_layout.get_rank(), false, false);
+                    return expected;
+                }
+            }
+        }
+    }
     if (!_forcing_map.empty() && _forcing_map.count(node.id()) != 0) {
         expected = _forcing_map.at(node.id()).first;
     } else if (node.is_type()) {
diff --git a/src/plugins/intel_gpu/tests/shape_infer/broadcast_si_test.cpp b/src/plugins/intel_gpu/tests/shape_infer/broadcast_si_test.cpp
index dfea42e1131..0187666a197 100644
--- a/src/plugins/intel_gpu/tests/shape_infer/broadcast_si_test.cpp
+++ b/src/plugins/intel_gpu/tests/shape_infer/broadcast_si_test.cpp
@@ -6,6 +6,7 @@
 
 #include
 #include
+#include <intel_gpu/primitives/eltwise.hpp>
 #include
 
 #include "broadcast_inst.h"
@@ -81,6 +82,87 @@ INSTANTIATE_TEST_SUITE_P(smoke, broadcast_test_two_inputs,
         }
     }));
 
+class broadcast_test_two_inputs_blocked_format : public testing::TestWithParam<broadcast_test_params> { };
+TEST_P(broadcast_test_two_inputs_blocked_format, shape_infer) {
+    auto p = GetParam();
+
+    auto& engine = get_test_engine();
+
+    auto data_mem = engine.allocate_memory(p.data_layout);
+    auto in1_mem = engine.allocate_memory(p.target_shape_layout);
+    auto in2_mem = engine.allocate_memory(p.target_shape_layout);
+
+    // data ------------|
+    // shape1 (blocked)- eltwise (plain)-- broadcast
+    // shape2 (blocked) /
+    // Expectation: eltwise's result is to be used as shape_mem of broadcast, and it should be plain format
+    topology topology;
+    topology.add(input_layout("data", layout{ov::PartialShape::dynamic(p.data_layout.get_rank()), p.data_layout.data_type, p.data_layout.format}),
+                 input_layout("shape_input_1", layout{ov::PartialShape::dynamic(p.target_shape_layout.get_rank()), p.target_shape_layout.data_type, p.target_shape_layout.format}),
+                 input_layout("shape_input_2", layout{ov::PartialShape::dynamic(p.target_shape_layout.get_rank()), p.target_shape_layout.data_type, p.target_shape_layout.format}),
+                 eltwise("target_shape", input_info("shape_input_1"), input_info("shape_input_2"), eltwise_mode::sum, ov::op::AutoBroadcastType::NUMPY),
+                 broadcast("output", input_info("data"), input_info("target_shape"), p.axes_mapping_data, p.mode)
+    );
+
+    build_options options;
+    options.set_option(build_option::optimize_data(true));
+    options.set_option(build_option::allow_new_shape_infer(true));
+
+    std::vector<int32_t> input_data(p.data_layout.get_linear_size(), 1);
+
+    network network(engine, topology, options);
+
+    set_values(data_mem, input_data);
+    set_values(in1_mem, p.target_shape_data);
+    set_values(in2_mem, p.target_shape_data);
+
+    network.set_input_data("data", data_mem);
+    network.set_input_data("shape_input_1", in1_mem);
+    network.set_input_data("shape_input_2", in2_mem);
+
+    auto outputs = network.execute();
+    auto output = outputs.at("output").get_memory();
+    cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
+
+    ASSERT_EQ(output->get_layout(), p.expected_layout);
+}
+
+INSTANTIATE_TEST_SUITE_P(smoke, broadcast_test_two_inputs_blocked_format,
+    testing::ValuesIn(std::vector<broadcast_test_params>{
+        {
+            layout{ov::PartialShape{8}, data_types::i32, format::b_fs_yx_fsv16}, //data layout
+            layout{ov::PartialShape{4}, data_types::i64, format::b_fs_yx_fsv16},
+            {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+             32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+             11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+             8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+            {0}, ov::op::BroadcastType::EXPLICIT,
+            layout{ov::PartialShape{8, 64, 22, 16}, data_types::i32, format::b_fs_yx_fsv16}
+        },
+        {
+            layout{ov::PartialShape{16, 1, 1, 1}, data_types::i32, format::b_fs_yx_fsv16}, //data layout
+            layout{ov::PartialShape{4}, data_types::i64, format::b_fs_yx_fsv16},
+            {8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+             25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+             12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+             10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+            {}, ov::op::BroadcastType::NUMPY,
+            layout{ov::PartialShape{16, 50, 24, 20}, data_types::i32, format::b_fs_yx_fsv16}
+        },
+        {
+            layout{ov::PartialShape{16}, data_types::i32, format::b_fs_zyx_fsv16}, //data layout
+            layout{ov::PartialShape{5}, data_types::i64, format::b_fs_zyx_fsv16},
+            {8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+             1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+             25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+             12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+             10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+            {0}, ov::op::BroadcastType::EXPLICIT,
+            layout{ov::PartialShape{16, 2, 50, 24, 20}, data_types::i32, format::b_fs_zyx_fsv16}
+        }
+    }));
+
+
 class broadcast_test_single_input : public testing::TestWithParam<broadcast_test_params> { };
 TEST_P(broadcast_test_single_input, shape_infer) {