From b98900859bb62ef442a8cef636e42fc95454f84e Mon Sep 17 00:00:00 2001
From: Taylor Yeonbok Lee <taylor.lee@intel.com>
Date: Tue, 17 Jan 2023 21:41:11 -0800
Subject: [PATCH] [GPU] Let reorder_inputs pass to use not only output_layout
 but also input layout for dynamic shape (#15037)

Previously, the reorder_inputs pass checked only the output layout of each node, on the assumption that the input and output ranks are the same when that pass runs.
However, with dynamic shapes using ngraph shape inference, there are cases where the input and output ranks differ. In such cases, the reorder_inputs pass inserts a reorder to the format of the current node's output_layout at the input of the current node, which causes an error.
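Fixed the above behavior by applying set_preferred_input_fmt / set_preferred_output_fmt, so that the pass can consult the preferred input format as well as the preferred output format.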
---
 .../src/graph/graph_optimizer/reorder_inputs.cpp   |  6 ++++--
 .../intel_gpu/src/graph/impls/ocl/gather.cpp       | 14 ++++++++++++++
 .../intel_gpu/src/graph/layout_optimizer.cpp       | 11 +++++++++++
 .../kernels/gather/gather_kernel_ref.cpp           |  7 ++++---
 .../gpu/single_layer_tests/dynamic/gather.cpp      |  7 +++++--
 5 files changed, 38 insertions(+), 7 deletions(-)
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
index 5992927df9b..9bd167eb70a 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
@@ -113,8 +113,9 @@ struct travel_direction_wrapper<direction_e::backwards> {
 static format get_target_output_format(layout_optimizer& lo, const std::map<program_node*, format::type>& fmt_map, program_node *node, program_node *next) {
     auto user_idx = node->get_user_index(*next);
 
+    bool allow_new_shape_infer = node->get_program().get_config().get_property(ov::intel_gpu::allow_new_shape_infer);
     // 1. Check selected preferred_output_format
-    if (lo.get_optimization_attributes().use_onednn_impls) {
+    if (lo.get_optimization_attributes().use_onednn_impls || allow_new_shape_infer) {
         // If onednn is not used, need to ignore get_preferred_output_fmt result as it is from onednn
         auto ret = node->get_preferred_output_fmt(user_idx);
 
@@ -133,8 +134,9 @@ static format get_target_output_format(layout_optimizer& lo, const std::map<prog
 static format get_target_input_format(layout_optimizer& lo, const std::map<program_node*, format::type>& fmt_map, program_node *node, program_node *prev) {
     auto dep_idx = node->get_dependency_index(*prev);
 
+    bool allow_new_shape_infer = node->get_program().get_config().get_property(ov::intel_gpu::allow_new_shape_infer);
     // 1. Check selected preferred_input_format
-    if (lo.get_optimization_attributes().use_onednn_impls) {
+    if (lo.get_optimization_attributes().use_onednn_impls || allow_new_shape_infer) {
         // If onednn is not used, need to ignore get_preferred_input_fmt result as it is from onednn
         auto ret = node->get_preferred_input_fmt(dep_idx);
         if (ret != format::any)
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/gather.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/gather.cpp
index c5dab2f9bb1..b251ebc4e6d 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/gather.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/gather.cpp
@@ -81,7 +81,21 @@ public:
         params.axis = convert_axis(primitive->axis, input_layout.get_rank());
         params.batch_dim = size_t(primitive->batch_dim);
         params.support_neg_ind = primitive->support_neg_ind;
+        auto output_layout = impl_param.get_output_layout(0);
+        auto in_rank = impl_param.get_input_layout(0).get_rank();
+        auto out_rank = impl_param.get_output_layout(0).get_rank();
+        if (in_rank > 4 && in_rank > out_rank) { // if in_rank <= 4, the dims are to be adjusted to 4 by convert_data_tensor
+            auto output_shape = impl_param.get_output_layout(0).get_partial_shape();
+            ov::PartialShape new_output_shape({output_shape[0], output_shape[1]});
+            for (size_t i = 0; i < in_rank - out_rank; ++i)
+                new_output_shape.push_back(1);
 
+            for (size_t i = 2; i < out_rank; ++i) {
+                new_output_shape.push_back(output_shape[i]);
+            }
+            output_layout = layout(new_output_shape, impl_param.get_output_layout(0).data_type, format::get_default_format(new_output_shape.size()));
+        }
+        params.outputs[0] = convert_data_tensor(output_layout);
         params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(1)));
         return {params, optional_params};
     }
diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
index da02e5389d3..5676f8717f2 100644
--- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
+++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -1673,6 +1673,14 @@ format layout_optimizer::get_preferred_format(program_node& node) {
     if (allow_new_shape_infer) {
         if (node.is_type<shape_of>())
             return format::get_default_format(node.get_dependency(0).get_output_layout(false).get_rank());
+
+        // Let the reorder_inputs pass check the input format instead of the output format in forward investigation, and vice versa
+        auto out_lay_rank = node.get_output_layout(false).get_rank();
+        auto in_lay_rank = node.get_dependencies().size() > 0 ? node.get_dependency(0).get_output_layout(false).get_rank() : out_lay_rank;
+        if (in_lay_rank != out_lay_rank)
+            node.set_preferred_input_fmt(0, get_preferred_format(node.get_dependency(0)));
+
+        // shape_infer_dep should be plain format because the memory is being read by ngraph shape infer as is
         for (auto u : node.get_users()) {
             for (auto dep_idx : u->get_shape_infer_dependencies()) {
                 if (u->get_dependencies().size() <= dep_idx)
@@ -1828,6 +1836,9 @@ format layout_optimizer::get_preferred_format(program_node& node) {
         expected = format::get_default_format(node.get_input_layouts()[0].get_rank(), false, false);
     }
 
+    if (allow_new_shape_infer && node.get_preferred_input_fmt() != format::any) {
+        node.set_preferred_output_fmt(0, expected);
+    }
     return expected;
 }
 
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gather/gather_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gather/gather_kernel_ref.cpp
index 02878916ad9..c5dc9a2db62 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gather/gather_kernel_ref.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gather/gather_kernel_ref.cpp
@@ -144,9 +144,10 @@ static std::string GetDictionaryIndexOrder(const gather_params& params, size_t a
         idx_order[i] = zero_val;
 
     // Fix size to inputs[0] dims size
-    for (size_t i = 0; i < params.outputs[0].GetDims().size() - params.inputs[0].GetDims().size(); i++)
-        idx_order.pop_back();
-
+    if (params.outputs[0].GetDims().size() > params.inputs[0].GetDims().size()) {
+        for (size_t i = 0; i < params.outputs[0].GetDims().size() - params.inputs[0].GetDims().size(); i++)
+            idx_order.pop_back();
+    }
     idx_order[axis] = input_axis_index_macro;
 
     return GetOrderString(idx_order);
diff --git a/src/tests/functional/plugin/gpu/single_layer_tests/dynamic/gather.cpp b/src/tests/functional/plugin/gpu/single_layer_tests/dynamic/gather.cpp
index acd18a2b67c..4dbd78c6fd0 100644
--- a/src/tests/functional/plugin/gpu/single_layer_tests/dynamic/gather.cpp
+++ b/src/tests/functional/plugin/gpu/single_layer_tests/dynamic/gather.cpp
@@ -184,13 +184,16 @@ const std::vector<GatherShapeParams> dynamicInputShapeConstTargetShape = {
         ov::test::InputShape(ov::PartialShape({}), {{3, 4, 3}}),
         3, 2
     },
-    #if 0 // TODO (99432) 5D=>4D test does not work properly because of the current reorder_impl logic does not work as expected.
     {
         ov::test::InputShape(ov::PartialShape({-1, -1, -1, -1, -1}), {{2, 4, 2, 2, 3}, {2, 4, 8, 9, 10}}),
         ov::test::InputShape(ov::PartialShape({}), {{2, 4}}),
         2, 2
     },
-    #endif
+    {
+        ov::test::InputShape(ov::PartialShape({-1, -1, -1, -1, -1, -1}), {{2, 4, 2, 3, 1, 3}, {2, 4, 7, 8, 9, 10}}),
+        ov::test::InputShape(ov::PartialShape({}), {{2, 4}}),
+        2, 2
+    },
 };
 
 INSTANTIATE_TEST_SUITE_P(smoke_dynamic_input_shapes_const_target_shapes, GatherGPUTest,