[GPU] Fix to skip reorder optimization during post_optimize_graph phase (#16908)

* [GPU] Fix to skip reorder optimization during post_optimize_graph phase

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Apply comment

Signed-off-by: Andrew Park <andrew.park@intel.com>

* update condition to check empty padding

Signed-off-by: Andrew Park <andrew.park@intel.com>

* add condition to check batch size

Signed-off-by: Andrew Park <andrew.park@intel.com>

---------

Signed-off-by: Andrew Park <andrew.park@intel.com>
This commit is contained in:
Andrew Kwangwoong Park 2023-04-15 11:24:06 +09:00 committed by GitHub
parent 824a5aa7fb
commit 507b3251ef
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 55 additions and 13 deletions

View File

@ -284,24 +284,29 @@ void remove_redundant_reorders::run(program& p) {
i_layout.data_padding.upper_size().spatial[1] == 0 && i_layout.data_padding.lower_size().spatial[1] == 0 &&
o_layout.data_padding.upper_size() == (tensor)0 && o_layout.data_padding.lower_size() == (tensor)0 &&
i_layout.data_type == o_layout.data_type) {
r_node.can_be_optimized(true);
r_node.requires_reinterpret(true);
// If the newly aligned pad is merged into output layout during post_optimize_graph phase
// and then buffer is reinterpreted, user node cannot handle pad properly for kernel execution
if (!update_implementations || (i_layout.feature() % 16 == 0 &&
i_layout.data_padding == padding() && o_layout.data_padding == padding()) || i_layout.batch() == 1) {
r_node.can_be_optimized(true);
r_node.requires_reinterpret(true);
auto pad_lo = o_layout.data_padding.lower_size();
auto pad_hi = o_layout.data_padding.upper_size();
auto pad_lo = o_layout.data_padding.lower_size();
auto pad_hi = o_layout.data_padding.upper_size();
pad_lo.batch[0] = i_layout.data_padding.lower_size().batch[0];
pad_hi.batch[0] = i_layout.data_padding.upper_size().batch[0];
pad_lo.batch[0] = i_layout.data_padding.lower_size().batch[0];
pad_hi.batch[0] = i_layout.data_padding.upper_size().batch[0];
pad_lo.feature[0] = i_layout.data_padding.lower_size().feature[0];
pad_hi.feature[0] = i_layout.data_padding.upper_size().feature[0];
pad_lo.feature[0] = i_layout.data_padding.lower_size().feature[0];
pad_hi.feature[0] = i_layout.data_padding.upper_size().feature[0];
if (i_layout.feature() % 16 != 0) {
pad_hi.feature[0] += 16 - i_layout.feature() % 16;
if (i_layout.feature() % 16 != 0) {
pad_hi.feature[0] += 16 - i_layout.feature() % 16;
}
r_node.merge_output_padding(padding{pad_lo.sizes(), pad_hi.sizes()});
continue;
}
r_node.merge_output_padding(padding{pad_lo.sizes(), pad_hi.sizes()});
continue;
}
if (!o_layout.compatible(i_layout))

View File

@ -12,6 +12,8 @@
#include "convolution_inst.h"
#include "reorder_inst.h"
#include "softmax_inst.h"
#include "reduce_inst.h"
#include "fully_connected_inst.h"
#include "pass_manager.h"
#include "to_string_utils.h"
@ -60,3 +62,38 @@ TEST(remove_redundant_reorders, remove_dep_dynamic) {
ASSERT_EQ(softmax_layout.format.value, format::bfyx);
}
TEST(remove_redundant_reorders, optimize_fsv16_to_bfyx) {
    // Graph under test:
    //   reorder(b_fs_yx_fsv16) -> reduce(b_fs_yx_fsv16) -> fully_connected(bfyx)
    //
    // add_required_reorders inserts a b_fs_yx_fsv16 -> bfyx reorder between the
    // reduce and the fully_connected node. In the post_optimize_graph phase,
    // when the reorder output's batch size is not 1, remove_redundant_reorders
    // must skip the fsv16 -> bfyx optimization (applied when spatials are equal
    // to 1). Consequently the FC input layout must carry no upper padding on
    // the feature dimension.
    auto& eng = get_test_engine();
    auto in_mem = eng.allocate_memory({ data_types::f32, format::bfyx, { 2, 1080, 7, 7 } });
    auto wei_mem = eng.allocate_memory({ data_types::f32, format::bfyx, { 1001, 1080, 1, 1 } });

    topology topo;
    topo.add(data("weights", wei_mem));
    topo.add(input_layout("input", in_mem->get_layout()));
    topo.add(reorder("reorder", input_info("input"), format::b_fs_yx_fsv16, data_types::f32));
    topo.add(reduce("reduce", input_info("reorder"), reduce_mode::min, {2, 3}, true));
    topo.add(fully_connected("fc", input_info("reduce"), "weights"));

    ExecutionConfig cfg = get_test_default_config(eng);
    cfg.set_property(ov::intel_gpu::optimize_data(true));

    network net(eng, topo, cfg);
    net.set_input_data("input", in_mem);
    net.execute();

    auto built_prog = net.get_program();
    ASSERT_NE(built_prog, nullptr);

    auto& fc_node = built_prog->get_node("fc");
    auto fc_input_layouts = fc_node.get_input_layouts();
    // No newly aligned feature pad may be merged into FC's input layout.
    ASSERT_EQ(fc_input_layouts.front().data_padding.upper_size().feature[0], 0);
}