[GPU] Fix to skip reorder optimization during post_optimize_graph phase (#16908)

* [GPU] Fix to skip reorder optimization during post_optimize_graph phase

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Apply comment

Signed-off-by: Andrew Park <andrew.park@intel.com>

* update condition to check empty padding

Signed-off-by: Andrew Park <andrew.park@intel.com>

* add condition to check batch size

Signed-off-by: Andrew Park <andrew.park@intel.com>

---------

Signed-off-by: Andrew Park <andrew.park@intel.com>
This commit is contained in:
Andrew Kwangwoong Park 2023-04-15 11:24:06 +09:00 committed by GitHub
parent 824a5aa7fb
commit 507b3251ef
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 55 additions and 13 deletions

View File

@ -284,24 +284,29 @@ void remove_redundant_reorders::run(program& p) {
i_layout.data_padding.upper_size().spatial[1] == 0 && i_layout.data_padding.lower_size().spatial[1] == 0 &&
o_layout.data_padding.upper_size() == (tensor)0 && o_layout.data_padding.lower_size() == (tensor)0 &&
i_layout.data_type == o_layout.data_type) {
r_node.can_be_optimized(true);
r_node.requires_reinterpret(true);
// If the newly aligned pad is merged into output layout during post_optimize_graph phase
// and then buffer is reinterpreted, user node cannot handle pad properly for kernel execution
if (!update_implementations || (i_layout.feature() % 16 == 0 &&
i_layout.data_padding == padding() && o_layout.data_padding == padding()) || i_layout.batch() == 1) {
r_node.can_be_optimized(true);
r_node.requires_reinterpret(true);
auto pad_lo = o_layout.data_padding.lower_size();
auto pad_hi = o_layout.data_padding.upper_size();
auto pad_lo = o_layout.data_padding.lower_size();
auto pad_hi = o_layout.data_padding.upper_size();
pad_lo.batch[0] = i_layout.data_padding.lower_size().batch[0];
pad_hi.batch[0] = i_layout.data_padding.upper_size().batch[0];
pad_lo.batch[0] = i_layout.data_padding.lower_size().batch[0];
pad_hi.batch[0] = i_layout.data_padding.upper_size().batch[0];
pad_lo.feature[0] = i_layout.data_padding.lower_size().feature[0];
pad_hi.feature[0] = i_layout.data_padding.upper_size().feature[0];
pad_lo.feature[0] = i_layout.data_padding.lower_size().feature[0];
pad_hi.feature[0] = i_layout.data_padding.upper_size().feature[0];
if (i_layout.feature() % 16 != 0) {
pad_hi.feature[0] += 16 - i_layout.feature() % 16;
if (i_layout.feature() % 16 != 0) {
pad_hi.feature[0] += 16 - i_layout.feature() % 16;
}
r_node.merge_output_padding(padding{pad_lo.sizes(), pad_hi.sizes()});
continue;
}
r_node.merge_output_padding(padding{pad_lo.sizes(), pad_hi.sizes()});
continue;
}
if (!o_layout.compatible(i_layout))

View File

@ -12,6 +12,8 @@
#include "convolution_inst.h"
#include "reorder_inst.h"
#include "softmax_inst.h"
#include "reduce_inst.h"
#include "fully_connected_inst.h"
#include "pass_manager.h"
#include "to_string_utils.h"
@ -60,3 +62,38 @@ TEST(remove_redundant_reorders, remove_dep_dynamic) {
ASSERT_EQ(softmax_layout.format.value, format::bfyx);
}
TEST(remove_redundant_reorders, optimize_fsv16_to_bfyx) {
    // Graph under test:
    //   reorder(b_fs_yx_fsv16) -> reduce(b_fs_yx_fsv16) -> fully_connected(bfyx)
    //
    // add_required_reorders inserts a b_fs_yx_fsv16 -> bfyx reorder between the
    // reduce and the fully_connected node. In the post_optimize_graph phase,
    // when the reorder output's batch size is not 1, remove_redundant_reorders
    // must skip the fsv16 -> bfyx optimization (applied when spatials are equal
    // to 1). Consequently the FC input layout must carry no upper padding on
    // the feature dimension.
    auto& eng = get_test_engine();
    auto in_mem = eng.allocate_memory({ data_types::f32, format::bfyx, { 2, 1080, 7, 7 } });
    auto wei_mem = eng.allocate_memory({ data_types::f32, format::bfyx, { 1001, 1080, 1, 1 } });

    topology topo;
    topo.add(data("weights", wei_mem));
    topo.add(input_layout("input", in_mem->get_layout()));
    topo.add(reorder("reorder", input_info("input"), format::b_fs_yx_fsv16, data_types::f32));
    topo.add(reduce("reduce", input_info("reorder"), reduce_mode::min, {2, 3}, true));
    topo.add(fully_connected("fc", input_info("reduce"), "weights"));

    ExecutionConfig cfg = get_test_default_config(eng);
    cfg.set_property(ov::intel_gpu::optimize_data(true));

    network net(eng, topo, cfg);
    net.set_input_data("input", in_mem);
    net.execute();

    auto built_prog = net.get_program();
    ASSERT_NE(built_prog, nullptr);

    auto& fc_node = built_prog->get_node("fc");
    auto fc_input_layouts = fc_node.get_input_layouts();
    // No newly aligned feature pad may be merged into FC's input layout.
    ASSERT_EQ(fc_input_layouts.front().data_padding.upper_size().feature[0], 0);
}