[IE CLDNN] Fuse reorder to permute. (#4549)

This commit is contained in:
Taylor Yeonbok Lee 2021-03-09 04:48:06 +09:00 committed by GitHub
parent 7f8d3aa638
commit e2c15a18ff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 210 additions and 32 deletions

View File

@ -242,6 +242,11 @@ bool PermuteKernel_tile_8x8_4x4::Validate(const Params& p, const optional_params
return false;
}
if (params.inputs[0].GetLayout() != params.output.GetLayout()) {
// Reorder cannot be fused
return false;
}
if (!is_rotating_except_batch(params.order)) {
return false;
}

View File

@ -25,6 +25,7 @@
#include "reshape_inst.h"
#include "one_hot_inst.h"
#include "permute_inst.h"
using namespace cldnn;
@ -278,45 +279,46 @@ void remove_redundant_reorders::run(program_impl& p) {
// This pass removes a reorder if the previous node can store directly to the required layout
itr = p.get_processing_order().begin();
while (itr != p.get_processing_order().end()) {
auto& node_ptr = *itr++;
if (!node_ptr->is_type<reorder>()) // only care for reorders
continue;
if (enable_reorder_fusing) {
while (itr != p.get_processing_order().end()) {
auto& node_ptr = *itr++;
if (!node_ptr->is_type<reorder>()) // only care for reorders
continue;
auto& node = node_ptr->as<reorder>();
auto& node = node_ptr->as<reorder>();
auto& input = node.input();
auto output_layout = node.get_output_layout();
auto& input = node.input();
auto output_layout = node.get_output_layout();
if (node.is_output())
continue;
if (node.is_output())
continue;
if (node.has_mean() || !node.get_primitive()->subtract_per_feature.empty())
continue;
if (node.has_mean() || !node.get_primitive()->subtract_per_feature.empty())
continue;
if (!node.get_fused_activations_funcs().empty())
continue;
if (!node.get_fused_activations_funcs().empty())
continue;
if (input.get_users().size() != 1 || node.get_users().empty())
continue;
if (input.get_users().size() != 1 || node.get_users().empty())
continue;
bool same_data_type = input.get_output_layout().data_type == output_layout.data_type;
bool allowed_dt_conversion_fuse = input.is_type<one_hot>();
if (!same_data_type && !allowed_dt_conversion_fuse)
continue;
bool same_data_type = input.get_output_layout().data_type == output_layout.data_type;
bool allowed_dt_conversion_fuse = (input.is_type<one_hot>()) || (input.is_type<permute>());
if (!same_data_type && !allowed_dt_conversion_fuse)
continue;
if (!lo.can_fuse_reorder_to_prev(input, *node.get_users().front(), input.get_output_layout().format, output_layout.format))
continue;
if (!lo.can_fuse_reorder_to_prev(input, *node.get_users().front(), input.get_output_layout().format, output_layout.format))
continue;
input.set_output_layout(output_layout, false);
if (input.type()->does_possible_implementation_exist(p.get_engine(), input)) {
p.replace_all_usages(node, input);
p.add_optimized_primitive_info(node.id());
p.remove_all_connections(node);
p.remove_if_dangling(node);
input.set_output_layout(output_layout, false);
if (input.type()->does_possible_implementation_exist(p.get_engine(), input)) {
p.replace_all_usages(node, input);
p.add_optimized_primitive_info(node.id());
p.remove_all_connections(node);
p.remove_if_dangling(node);
}
}
}
// This pass removes a reorder if the next node supports the reorder's input format and the data type doesn't change
itr = p.get_processing_order().begin();
while (itr != p.get_processing_order().end()) {

View File

@ -248,10 +248,12 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
fmt_next == format::b_fs_yx_fsv16 || fmt_next == format::b_fs_zyx_fsv16 || fmt_next == format::bs_fs_yx_bsv16_fsv16))
return true;
if (prev.is_type<permute>())
return true;
return false;
}
namespace {
bool should_use_winograd_2x3_s1(std::shared_ptr<const convolution> const& prim,
layout const& input_layout,

View File

@ -164,7 +164,7 @@ public:
bo_not_fused.set_option(build_option::allow_static_input_reorder(true));
}
void compare(network& not_fused, network& fused, T& p) {
void compare(network& not_fused, network& fused, T& p, bool count_reorder = false) {
auto outputs_ref = not_fused.execute();
auto outputs_fused = fused.execute();
@ -199,8 +199,8 @@ public:
SCOPED_TRACE(description.str());
// Subtract reorders count to handle execution in different layouts when input/output reorders can be added in the graph
ASSERT_EQ(fused.get_executed_primitives().size() - reorders_count_fused, p.expected_fused_primitives);
ASSERT_EQ(not_fused.get_executed_primitives().size() - reorders_count_not_fused, p.expected_not_fused_primitives);
ASSERT_EQ(fused.get_executed_primitives().size() - (count_reorder ? 0 : reorders_count_fused), p.expected_fused_primitives);
ASSERT_EQ(not_fused.get_executed_primitives().size() - (count_reorder ? 0 : reorders_count_not_fused), p.expected_not_fused_primitives);
ASSERT_EQ(outputs_ref.size(), outputs_fused.size());
ASSERT_EQ(outputs_ref.size(), size_t(1));
@ -6362,6 +6362,84 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, permute_scale_eltwise_actv_scale_actv,
permute_params{CASE_PERMUTE_TILE_8x8_4x4_6D_3, 2, 7},
}), );
/* ------------------------------------------------------------------------------------------------------------ */
/* ---------------------------- PERMUTE FUSE REDUNDANT REORDER cases ------------------------------------------ */
/* ------------------------------------------------------------------------------------------------------------ */
// Parameter set for the permute -> reorder -> permute fusing tests.
// Instances are brace-initialized positionally (CASE_PERMUTE_REORDER_* macro
// followed by the two expected counts), so the field order must not change.
struct permute_reorder_params {
// Tensor of the "input" layout (see get_input_layout()).
tensor in_shape;
// NOTE(review): not read by any code visible in this chunk - confirm it is still needed.
tensor out_shape;
// Order passed to the "permute1" primitive.
std::vector<uint16_t> permute_order1;
// Order passed to the "permute2" primitive.
std::vector<uint16_t> permute_order2;
// Data type of the "input" layout (see get_input_layout()).
data_types permute_type;
// Data type requested from the "reorder1" primitive.
data_types output_type;
// Format of the "input" layout (see get_input_layout()).
format permute_format;
// Format requested from the "reorder1" primitive.
format output_format;
// Executed-primitive count that compare() expects from the fused network.
size_t expected_fused_primitives;
// Executed-primitive count that compare() expects from the non-fused network.
size_t expected_not_fused_primitives;
};
// Each macro expands to the first eight initializers of permute_reorder_params, in order:
//   in_shape, out_shape, permute_order1, permute_order2,
//   permute_type, output_type, permute_format, output_format
// The two expected primitive counts are appended where the macro is used.
// NOTE(review): in the *_S8_TO_F32_0 and *_U8_TO_F16_0 cases out_shape differs from in_shape
// and holds a different number of elements; out_shape is not read by the code visible here,
// but the values look copy-pasted - confirm.
#define CASE_PERMUTE_REORDER_F32_0 {1, 16, 32, 2}, {1, 16, 32, 2}, {0, 3, 2, 1}, {0, 3, 2, 1}, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_F32_1 {2, 16, 16, 16}, {2, 16, 16, 16}, {0, 3, 2, 1}, {0, 3, 2, 1}, data_types::f32, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
#define CASE_PERMUTE_REORDER_F32_2 {1, 16, 4, 5, 16}, {1, 16, 4, 5, 16}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F16_0 {1, 16, 2, 4}, {1, 16, 2, 4}, {0, 2, 1, 3}, {0, 2, 1, 3}, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_F16_1 {1, 16, 4, 5, 16}, {1, 16, 4, 5, 16}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F16_2 {1, 5, 1, 2, 14}, {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
// Cases below change the data type in the reorder (permute_type != output_type).
#define CASE_PERMUTE_REORDER_S8_0 {1, 15, 4, 5}, {1, 15, 4, 5}, {0, 2, 3, 1}, {0, 3, 1, 2}, data_types::i8, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
#define CASE_PERMUTE_REORDER_S8_1 {1, 2, 15, 4, 5}, {1, 2, 15, 4, 5}, {0, 2, 4, 1, 3}, {0, 3, 1, 4, 2}, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F32_TO_F16_0 {1, 5, 1, 2, 14}, {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f32, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_S8_TO_F32_0 {1, 18, 1, 2, 2}, {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_U8_TO_F16_0 {1, 17, 1, 2, 7}, {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::u8, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
// Fixture for the permute/reorder fusing cases: builds the fused and the
// non-fused network from the topologies prepared by the test body, feeds both
// the same input and delegates the checks to compare().
class PermuteReorderFusingTest : public ::BaseFusingTest<permute_reorder_params> {
public:
    // Layout of the "input" primitive: permute_type / permute_format / in_shape, no padding.
    layout get_input_layout(permute_reorder_params& p) {
        layout input_desc{ p.permute_type, p.permute_format, p.in_shape, padding{} };
        return input_desc;
    }

    // Runs both networks on identical input data and compares them.
    void execute(permute_reorder_params& p) {
        auto input_mem = get_mem(get_input_layout(p));

        network reference_net(this->engine, this->topology_non_fused, bo_not_fused);
        network fused_net(this->engine, this->topology_fused, bo_fused);

        fused_net.set_input_data("input", input_mem);
        reference_net.set_input_data("input", input_mem);

        // The last argument is compare()'s count_reorder flag: reorders are
        // included in the executed-primitive counts checked for these cases.
        const bool count_reorder = true;
        compare(reference_net, fused_net, p, count_reorder);
    }
};
// Test-suite type for the redundant-reorder cases; adds nothing to the fixture.
class permute_redundant_reorder : public PermuteReorderFusingTest {};

// Topology under test: input -> permute1 -> reorder1 -> permute2.
TEST_P(permute_redundant_reorder, basic) {
    auto params = GetParam();

    const auto in_node = input_layout("input", get_input_layout(params));
    const auto first_permute = permute("permute1", "input", params.permute_order1);
    // reorder1 is the primitive expected to be fused into permute1.
    const auto fused_candidate = reorder("reorder1", "permute1", params.output_format, params.output_type);
    // Trailing op so that reorder1 is not the network output
    // (the reorder-removal pass skips reorders that are outputs).
    const auto trailing_permute = permute("permute2", "reorder1", params.permute_order2);

    create_topologies(in_node, first_permute, fused_candidate, trailing_permute);

    tolerance = 1e-5f;
    execute(params);
}
// The two trailing numbers of every entry fill expected_fused_primitives and
// expected_not_fused_primitives (3 and 4 for all cases). The fixture's execute()
// passes count_reorder = true to compare(), so reorders are included in these counts.
INSTANTIATE_TEST_CASE_P(fusings_gpu, permute_redundant_reorder,
::testing::ValuesIn(std::vector<permute_reorder_params> {
permute_reorder_params{CASE_PERMUTE_REORDER_F32_0, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_F32_1, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_F32_2, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_F16_0, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_F16_1, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_F16_2, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_S8_0, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_S8_1, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_F32_TO_F16_0, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_S8_TO_F32_0, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_U8_TO_F16_0, 3, 4},
}),);
class NormalizeFusingTest : public ::BaseFusingTest<normalize_test_params> {
public:
void execute(normalize_test_params& p) {

View File

@ -517,6 +517,97 @@ TEST(permute_gpu_i64, basic_bfyx_permute_0_1_3_2) {
permute_test_with_reorder<data_types::i64>();
}
TEST(permute_fuse_reorder_gpu_f32, basic_b_fs_yx_fsv4_permute_1_8_16_1)
{
    // Checks that the reorder following a permute is fused into that permute
    // when optimize_data is enabled, and that the result is unchanged.
    //
    // Input          : format::bfyx, tensor {1, 8, 1, 16} (128 elements)
    // Permute1 order : {0, 3, 1, 2}
    // Permute2 order : {0, 2, 3, 1} (applied last so the output equals the input values)
    const auto& engine = get_test_engine();

    auto input = memory::allocate(engine, { data_types::f32, format::bfyx, {1, 8, 1, 16}});

    // Sequential payload 0, 1, ..., 127: one distinct value per input element.
    std::vector<float> values(128);
    for (size_t i = 0; i < values.size(); i++)
    {
        values[i] = static_cast<float>(i);
    }

    set_values(input, values);

    // unfused
    topology topology_unfused(
        input_layout("input", input.get_layout()),
        reorder("reorder1", "input", format::b_fs_yx_fsv4, data_types::f32),
        permute("permute", "reorder1", { 0, 3, 1, 2}),
        reorder("reorder2", "permute", format::bfyx, data_types::f32),
        permute("out", "reorder2", { 0, 2, 3, 1}));

    cldnn::build_options options_unfused;
    options_unfused.set_option(cldnn::build_option::optimize_data(false));
    options_unfused.set_option(cldnn::build_option::allow_static_input_reorder(true));

    network unfused(engine, topology_unfused, options_unfused);
    unfused.set_input_data("input", input);

    // fused network
    topology topology_fused(
        input_layout("input", input.get_layout()),
        reorder("reorder1", "input", format::b_fs_yx_fsv4, data_types::f32),
        permute("permute", "reorder1", { 0, 3, 1, 2}),
        reorder("reorder2", "permute", format::bfyx, data_types::f32), // to be fused to previous permute
        permute("out", "reorder2", { 0, 2, 3, 1})); // return to original value

    cldnn::build_options options_fused;
    options_fused.set_option(cldnn::build_option::optimize_data(true));

    network fused(engine, topology_fused, options_fused);
    fused.set_input_data("input", input);

    auto outputs_fused = fused.execute();
    auto outputs_unfused = unfused.execute();

    auto output_fused = outputs_fused.begin()->second.get_memory();
    auto output_fused_ptr = output_fused.pointer<float>();
    auto output_unfused = outputs_unfused.begin()->second.get_memory();
    auto output_unfused_ptr = output_unfused.pointer<float>();

    EXPECT_EQ(output_fused.get_layout().format, cldnn::format::bfyx);
    EXPECT_EQ(output_unfused.get_layout().format, cldnn::format::bfyx);

    // The fused network executes one primitive fewer (reorder2 is gone).
    // size_t literals keep the comparison free of signed/unsigned mixing.
    EXPECT_EQ(fused.get_executed_primitives().size(), size_t(4));
    EXPECT_EQ(unfused.get_executed_primitives().size(), size_t(5));

    for (size_t i = 0; i < values.size(); i++)
    {
        EXPECT_FLOAT_EQ(output_unfused_ptr[i], output_fused_ptr[i]);
        EXPECT_FLOAT_EQ(output_unfused_ptr[i], values[i]);
    }
}
TEST(fc_permute_crop_gpu, basic_permute_yxfb)
{
const auto& engine = get_test_engine();