[IE CLDNN] Fuse reorder to permute. (#4549)
This commit is contained in:
parent
7f8d3aa638
commit
e2c15a18ff
@ -242,6 +242,11 @@ bool PermuteKernel_tile_8x8_4x4::Validate(const Params& p, const optional_params
|
||||
return false;
|
||||
}
|
||||
|
||||
if (params.inputs[0].GetLayout() != params.output.GetLayout()) {
|
||||
// Reorder cannot be fused
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!is_rotating_except_batch(params.order)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -25,6 +25,7 @@
|
||||
|
||||
#include "reshape_inst.h"
|
||||
#include "one_hot_inst.h"
|
||||
#include "permute_inst.h"
|
||||
|
||||
using namespace cldnn;
|
||||
|
||||
@ -278,45 +279,46 @@ void remove_redundant_reorders::run(program_impl& p) {
|
||||
|
||||
// This pass removes a reorder if the previous node can store directly to the required layout
|
||||
itr = p.get_processing_order().begin();
|
||||
while (itr != p.get_processing_order().end()) {
|
||||
auto& node_ptr = *itr++;
|
||||
if (!node_ptr->is_type<reorder>()) // only care for reorders
|
||||
continue;
|
||||
if (enable_reorder_fusing) {
|
||||
while (itr != p.get_processing_order().end()) {
|
||||
auto& node_ptr = *itr++;
|
||||
if (!node_ptr->is_type<reorder>()) // only care for reorders
|
||||
continue;
|
||||
|
||||
auto& node = node_ptr->as<reorder>();
|
||||
auto& node = node_ptr->as<reorder>();
|
||||
|
||||
auto& input = node.input();
|
||||
auto output_layout = node.get_output_layout();
|
||||
auto& input = node.input();
|
||||
auto output_layout = node.get_output_layout();
|
||||
|
||||
if (node.is_output())
|
||||
continue;
|
||||
if (node.is_output())
|
||||
continue;
|
||||
|
||||
if (node.has_mean() || !node.get_primitive()->subtract_per_feature.empty())
|
||||
continue;
|
||||
if (node.has_mean() || !node.get_primitive()->subtract_per_feature.empty())
|
||||
continue;
|
||||
|
||||
if (!node.get_fused_activations_funcs().empty())
|
||||
continue;
|
||||
if (!node.get_fused_activations_funcs().empty())
|
||||
continue;
|
||||
|
||||
if (input.get_users().size() != 1 || node.get_users().empty())
|
||||
continue;
|
||||
if (input.get_users().size() != 1 || node.get_users().empty())
|
||||
continue;
|
||||
|
||||
bool same_data_type = input.get_output_layout().data_type == output_layout.data_type;
|
||||
bool allowed_dt_conversion_fuse = input.is_type<one_hot>();
|
||||
if (!same_data_type && !allowed_dt_conversion_fuse)
|
||||
continue;
|
||||
bool same_data_type = input.get_output_layout().data_type == output_layout.data_type;
|
||||
bool allowed_dt_conversion_fuse = (input.is_type<one_hot>()) || (input.is_type<permute>());
|
||||
if (!same_data_type && !allowed_dt_conversion_fuse)
|
||||
continue;
|
||||
|
||||
if (!lo.can_fuse_reorder_to_prev(input, *node.get_users().front(), input.get_output_layout().format, output_layout.format))
|
||||
continue;
|
||||
if (!lo.can_fuse_reorder_to_prev(input, *node.get_users().front(), input.get_output_layout().format, output_layout.format))
|
||||
continue;
|
||||
|
||||
input.set_output_layout(output_layout, false);
|
||||
if (input.type()->does_possible_implementation_exist(p.get_engine(), input)) {
|
||||
p.replace_all_usages(node, input);
|
||||
p.add_optimized_primitive_info(node.id());
|
||||
p.remove_all_connections(node);
|
||||
p.remove_if_dangling(node);
|
||||
input.set_output_layout(output_layout, false);
|
||||
if (input.type()->does_possible_implementation_exist(p.get_engine(), input)) {
|
||||
p.replace_all_usages(node, input);
|
||||
p.add_optimized_primitive_info(node.id());
|
||||
p.remove_all_connections(node);
|
||||
p.remove_if_dangling(node);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This pass removes a reorder if the next node supports the reorder's input format and the data type doesn't change
|
||||
itr = p.get_processing_order().begin();
|
||||
while (itr != p.get_processing_order().end()) {
|
||||
|
@ -248,10 +248,12 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
|
||||
fmt_next == format::b_fs_yx_fsv16 || fmt_next == format::b_fs_zyx_fsv16 || fmt_next == format::bs_fs_yx_bsv16_fsv16))
|
||||
return true;
|
||||
|
||||
if (prev.is_type<permute>())
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
namespace {
|
||||
bool should_use_winograd_2x3_s1(std::shared_ptr<const convolution> const& prim,
|
||||
layout const& input_layout,
|
||||
|
@ -164,7 +164,7 @@ public:
|
||||
bo_not_fused.set_option(build_option::allow_static_input_reorder(true));
|
||||
}
|
||||
|
||||
void compare(network& not_fused, network& fused, T& p) {
|
||||
void compare(network& not_fused, network& fused, T& p, bool count_reorder = false) {
|
||||
auto outputs_ref = not_fused.execute();
|
||||
auto outputs_fused = fused.execute();
|
||||
|
||||
@ -199,8 +199,8 @@ public:
|
||||
SCOPED_TRACE(description.str());
|
||||
|
||||
// Subtract reorders count to handle execution in different layouts when input/output reorders can be added in the graph
|
||||
ASSERT_EQ(fused.get_executed_primitives().size() - reorders_count_fused, p.expected_fused_primitives);
|
||||
ASSERT_EQ(not_fused.get_executed_primitives().size() - reorders_count_not_fused, p.expected_not_fused_primitives);
|
||||
ASSERT_EQ(fused.get_executed_primitives().size() - (count_reorder ? 0 : reorders_count_fused), p.expected_fused_primitives);
|
||||
ASSERT_EQ(not_fused.get_executed_primitives().size() - (count_reorder ? 0 : reorders_count_not_fused), p.expected_not_fused_primitives);
|
||||
ASSERT_EQ(outputs_ref.size(), outputs_fused.size());
|
||||
ASSERT_EQ(outputs_ref.size(), size_t(1));
|
||||
|
||||
@ -6362,6 +6362,84 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, permute_scale_eltwise_actv_scale_actv,
|
||||
permute_params{CASE_PERMUTE_TILE_8x8_4x4_6D_3, 2, 7},
|
||||
}), );
|
||||
|
||||
/* ------------------------------------------------------------------------------------------------------------ */
|
||||
/* ---------------------------- PERMUTE FUSE REDUNDANT REORDER cases ------------------------------------------ */
|
||||
/* ------------------------------------------------------------------------------------------------------------ */
|
||||
|
||||
// Parameter set for the permute -> reorder -> permute fusing tests.
struct permute_reorder_params {
|
||||
// Shape used to build the network input layout.
tensor in_shape;
|
||||
// NOTE(review): not read by the test code visible in this hunk.
tensor out_shape;
|
||||
// Order passed to the first permute ("permute1").
std::vector<uint16_t> permute_order1;
|
||||
// Order passed to the second permute ("permute2").
std::vector<uint16_t> permute_order2;
|
||||
// Data type of the network input layout (fed to "permute1").
data_types permute_type;
|
||||
// Data type requested by the reorder that is expected to be fused.
data_types output_type;
|
||||
// Format of the network input layout (fed to "permute1").
format permute_format;
|
||||
// Format requested by the reorder that is expected to be fused.
format output_format;
|
||||
// Expected number of executed primitives in the fused network.
size_t expected_fused_primitives;
|
||||
// Expected number of executed primitives in the non-fused network.
size_t expected_not_fused_primitives;
|
||||
};
|
||||
|
||||
#define CASE_PERMUTE_REORDER_F32_0 {1, 16, 32, 2}, {1, 16, 32, 2}, {0, 3, 2, 1}, {0, 3, 2, 1}, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_F32_1 {2, 16, 16, 16}, {2, 16, 16, 16}, {0, 3, 2, 1}, {0, 3, 2, 1}, data_types::f32, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_F32_2 {1, 16, 4, 5, 16}, {1, 16, 4, 5, 16}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
|
||||
#define CASE_PERMUTE_REORDER_F16_0 {1, 16, 2, 4}, {1, 16, 2, 4}, {0, 2, 1, 3}, {0, 2, 1, 3}, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_F16_1 {1, 16, 4, 5, 16}, {1, 16, 4, 5, 16}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
#define CASE_PERMUTE_REORDER_F16_2 {1, 5, 1, 2, 14}, {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
|
||||
#define CASE_PERMUTE_REORDER_S8_0 {1, 15, 4, 5}, {1, 15, 4, 5}, {0, 2, 3, 1}, {0, 3, 1, 2}, data_types::i8, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_S8_1 {1, 2, 15, 4, 5}, {1, 2, 15, 4, 5}, {0, 2, 4, 1, 3}, {0, 3, 1, 4, 2}, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
|
||||
#define CASE_PERMUTE_REORDER_F32_TO_F16_0 {1, 5, 1, 2, 14}, {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f32, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
#define CASE_PERMUTE_REORDER_S8_TO_F32_0 {1, 18, 1, 2, 2}, {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
#define CASE_PERMUTE_REORDER_U8_TO_F16_0 {1, 17, 1, 2, 7}, {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::u8, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
|
||||
// Fixture for tests that check fusing of a reorder into the preceding permute.
class PermuteReorderFusingTest : public ::BaseFusingTest<permute_reorder_params> {
|
||||
public:
|
||||
|
||||
// Builds the fused and non-fused networks, feeds both the same input memory
// and compares them.
void execute(permute_reorder_params& p) {
|
||||
auto input_prim = get_mem(get_input_layout(p));
|
||||
network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
|
||||
network network_fused(this->engine, this->topology_fused, bo_fused);
|
||||
network_fused.set_input_data("input", input_prim);
|
||||
network_not_fused.set_input_data("input", input_prim);
|
||||
// count_reorder = true: reorders are kept in the executed-primitive counts,
// so a fused (removed) reorder shows up as a difference between the networks.
compare(network_not_fused, network_fused, p, true);
|
||||
}
|
||||
|
||||
// Returns the unpadded input layout built from the permute type, format and input shape.
layout get_input_layout(permute_reorder_params& p) {
|
||||
return layout{ p.permute_type, p.permute_format, p.in_shape, padding{} };
|
||||
}
|
||||
};
|
||||
|
||||
class permute_redundant_reorder : public PermuteReorderFusingTest {};
|
||||
|
||||
// Topology: input -> permute1 -> reorder1 -> permute2.
// reorder1 is expected to be fused into permute1 in the optimized network.
TEST_P(permute_redundant_reorder, basic) {
|
||||
auto p = GetParam();
|
||||
create_topologies(
|
||||
input_layout("input", get_input_layout(p)),
|
||||
permute("permute1", "input", p.permute_order1),
|
||||
reorder("reorder1", "permute1", p.output_format, p.output_type), // to be fused
|
||||
permute("permute2", "reorder1", p.permute_order2) // dummy last op so that reorder1 is not an output node (output reorders are not fused)
|
||||
);
|
||||
tolerance = 1e-5f;
|
||||
execute(p);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(fusings_gpu, permute_redundant_reorder,
|
||||
::testing::ValuesIn(std::vector<permute_reorder_params> {
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F32_0, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F32_1, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F32_2, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F16_0, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F16_1, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F16_2, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_S8_0, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_S8_1, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F32_TO_F16_0, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_S8_TO_F32_0, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_U8_TO_F16_0, 3, 4},
|
||||
}),);
|
||||
|
||||
class NormalizeFusingTest : public ::BaseFusingTest<normalize_test_params> {
|
||||
public:
|
||||
void execute(normalize_test_params& p) {
|
||||
|
@ -517,6 +517,97 @@ TEST(permute_gpu_i64, basic_bfyx_permute_0_1_3_2) {
|
||||
permute_test_with_reorder<data_types::i64>();
|
||||
}
|
||||
|
||||
TEST(permute_fuse_reorder_gpu_f32, basic_b_fs_yx_fsv4_permute_1_8_16_1)
|
||||
{
|
||||
// Input : tensor {1, 8, 1, 16} in bfyx format (128 values)
|
||||
// Permute1 order : {0, 3, 1, 2}
|
||||
// Permute2 order : {0, 2, 3, 1}
|
||||
|
||||
const auto& engine = get_test_engine();
|
||||
auto input = memory::allocate(engine, { data_types::f32, format::bfyx, {1, 8, 1, 16}});
|
||||
|
||||
std::vector<float> values = {
|
||||
0.0f, 1.0f, 2.0f, 3.0f,
|
||||
4.0f, 5.0f, 6.0f, 7.0f,
|
||||
8.0f, 9.0f, 10.0f, 11.0f,
|
||||
12.0f, 13.0f, 14.0f, 15.0f,
|
||||
16.0f, 17.0f, 18.0f, 19.0f,
|
||||
20.0f, 21.0f, 22.0f, 23.0f,
|
||||
24.0f, 25.0f, 26.0f, 27.0f,
|
||||
28.0f, 29.0f, 30.0f, 31.0f,
|
||||
32.0f, 33.0f, 34.0f, 35.0f,
|
||||
36.0f, 37.0f, 38.0f, 39.0f,
|
||||
40.0f, 41.0f, 42.0f, 43.0f,
|
||||
44.0f, 45.0f, 46.0f, 47.0f,
|
||||
48.0f, 49.0f, 50.0f, 51.0f,
|
||||
52.0f, 53.0f, 54.0f, 55.0f,
|
||||
56.0f, 57.0f, 58.0f, 59.0f,
|
||||
60.0f, 61.0f, 62.0f, 63.0f,
|
||||
64.0f, 65.0f, 66.0f, 67.0f,
|
||||
68.0f, 69.0f, 70.0f, 71.0f,
|
||||
72.0f, 73.0f, 74.0f, 75.0f,
|
||||
76.0f, 77.0f, 78.0f, 79.0f,
|
||||
80.0f, 81.0f, 82.0f, 83.0f,
|
||||
84.0f, 85.0f, 86.0f, 87.0f,
|
||||
88.0f, 89.0f, 90.0f, 91.0f,
|
||||
92.0f, 93.0f, 94.0f, 95.0f,
|
||||
96.0f, 97.0f, 98.0f, 99.0f,
|
||||
100.0f, 101.0f, 102.0f, 103.0f,
|
||||
104.0f, 105.0f, 106.0f, 107.0f,
|
||||
108.0f, 109.0f, 110.0f, 111.0f,
|
||||
112.0f, 113.0f, 114.0f, 115.0f,
|
||||
116.0f, 117.0f, 118.0f, 119.0f,
|
||||
120.0f, 121.0f, 122.0f, 123.0f,
|
||||
124.0f, 125.0f, 126.0f, 127.0f
|
||||
};
|
||||
|
||||
set_values(input, values);
|
||||
// unfused
|
||||
topology topology_unfused(
|
||||
input_layout("input", input.get_layout()),
|
||||
reorder("reorder1", "input", format::b_fs_yx_fsv4, data_types::f32),
|
||||
permute("permute", "reorder1", { 0, 3, 1, 2}),
|
||||
reorder("reorder2", "permute", format::bfyx, data_types::f32),
|
||||
permute("out", "reorder2", { 0, 2, 3, 1}));
|
||||
|
||||
cldnn::build_options options_unfused;
|
||||
options_unfused.set_option(cldnn::build_option::optimize_data(false));
|
||||
options_unfused.set_option(cldnn::build_option::allow_static_input_reorder(true));
|
||||
|
||||
network unfused(engine, topology_unfused, options_unfused);
|
||||
unfused.set_input_data("input", input);
|
||||
|
||||
// fused network
|
||||
topology topology_fused(
|
||||
input_layout("input", input.get_layout()),
|
||||
reorder("reorder1", "input", format::b_fs_yx_fsv4, data_types::f32),
|
||||
permute("permute", "reorder1", { 0, 3, 1, 2}),
|
||||
reorder("reorder2", "permute", format::bfyx, data_types::f32), // to be fused to previous permute
|
||||
permute("out", "reorder2", { 0, 2, 3, 1})); // return to original value
|
||||
|
||||
cldnn::build_options options_fused;
|
||||
options_fused.set_option(cldnn::build_option::optimize_data(true));
|
||||
network fused(engine, topology_fused, options_fused);
|
||||
fused.set_input_data("input", input);
|
||||
|
||||
auto outputs_fused = fused.execute();
|
||||
auto outputs_unfused = unfused.execute();
|
||||
auto output_fused = outputs_fused.begin()->second.get_memory();
|
||||
auto output_fused_ptr = output_fused.pointer<float>();
|
||||
auto output_unfused = outputs_unfused.begin()->second.get_memory();
|
||||
auto output_unfused_ptr = output_unfused.pointer<float>();
|
||||
EXPECT_EQ(output_fused.get_layout().format, cldnn::format::bfyx);
|
||||
EXPECT_EQ(output_unfused.get_layout().format, cldnn::format::bfyx);
|
||||
EXPECT_EQ(fused.get_executed_primitives().size(), 4);
|
||||
EXPECT_EQ(unfused.get_executed_primitives().size(), 5);
|
||||
|
||||
for (size_t i = 0; i < values.size(); i++)
|
||||
{
|
||||
EXPECT_FLOAT_EQ(output_unfused_ptr[i], output_fused_ptr[i]);
|
||||
EXPECT_FLOAT_EQ(output_unfused_ptr[i], values[i]);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(fc_permute_crop_gpu, basic_permute_yxfb)
|
||||
{
|
||||
const auto& engine = get_test_engine();
|
||||
|
Loading…
Reference in New Issue
Block a user