Optimized permute kernel cannot be fused with both eltwise + reorder. (#15542)

This commit is contained in:
Taylor Yeonbok Lee 2023-02-08 10:32:13 -08:00 committed by GitHub
parent ac1e885324
commit b9107ac7ca
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 56 additions and 5 deletions

View File

@ -187,7 +187,7 @@ JitConstants PermuteKernel_tile_8x8_4x4::GetJitConstants(const permute_params& p
jit.AddConstant(MakeJitConstant("TRANS_BUF_SIZE", (tile_size / vector_width) * tile_size * total_lws) ); jit.AddConstant(MakeJitConstant("TRANS_BUF_SIZE", (tile_size / vector_width) * tile_size * total_lws) );
if (!params.fused_ops.empty()) { if (!params.fused_ops.empty()) {
std::vector<std::string> output_order = GetFusedOpOrderVector(params.outputs[0].GetDims().size()); std::vector<std::string> output_order = GetFusedOpOrderVector(params.inputs[0].GetDims().size());
FusedOpsConfiguration conf = {"", output_order, "input_var", params.inputs[0].GetDType(), 1}; FusedOpsConfiguration conf = {"", output_order, "input_var", params.inputs[0].GetDType(), 1};
jit.Merge(MakeFusedOpsJitConstants(params, {conf})); jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
} }
@ -248,6 +248,12 @@ CommonDispatchData PermuteKernel_tile_8x8_4x4::SetDefault(const permute_params&
bool PermuteKernel_tile_8x8_4x4::Validate(const Params& p, const optional_params& o) const { bool PermuteKernel_tile_8x8_4x4::Validate(const Params& p, const optional_params& o) const {
if (!Parent::Validate(p, o)) return false; if (!Parent::Validate(p, o)) return false;
const permute_params& params = static_cast<const permute_params&>(p);
if (params.outputs[0].PitchesDifferFromLogicalDims() || params.inputs[0].PitchesDifferFromLogicalDims()) {
return false;
}
std::function<bool(const std::vector<uint16_t>&)> is_rotating_except_batch = [](const std::vector<uint16_t>& order) { std::function<bool(const std::vector<uint16_t>&)> is_rotating_except_batch = [](const std::vector<uint16_t>& order) {
// Target transform: Rotate feature dim to back to be taken as inner-most axis // Target transform: Rotate feature dim to back to be taken as inner-most axis
// ex) 0(b), 4(f), 1(z), 2(y), 3(x) // ex) 0(b), 4(f), 1(z), 2(y), 3(x)
@ -260,15 +266,22 @@ bool PermuteKernel_tile_8x8_4x4::Validate(const Params& p, const optional_params
return true; return true;
}; };
const permute_params& params = static_cast<const permute_params&>(p);
if (!is_rotating_except_batch(params.order)) { if (!is_rotating_except_batch(params.order)) {
return false; return false;
} }
if (params.outputs[0].PitchesDifferFromLogicalDims() || params.inputs[0].PitchesDifferFromLogicalDims()) { std::function<bool(const permute_params&)> has_fused_op = [] (const permute_params& params) {
if (!params.fused_ops.empty()) {
for (auto f : params.fused_ops) {
if (f.GetType() != KernelType::REORDER)
return true;
}
}
return false;
};
if (has_fused_op(params) && params.inputs[0].GetDims().size() != params.outputs[0].GetDims().size())
return false; return false;
}
return true; return true;
} }

View File

@ -79,6 +79,16 @@ public:
layout get_input_layout(permute_reorder_params& p) { layout get_input_layout(permute_reorder_params& p) {
return layout{ p.permute_type, p.permute_format, p.in_shape, padding{} }; return layout{ p.permute_type, p.permute_format, p.in_shape, padding{} };
} }
// Builds the layout used for the eltwise's extra input in the fusing tests.
// The dims are the permute input dims rearranged by p.permute_order1
// (result[i] = input[permute_order1[i]]); data type and format are taken from
// the permute input (p.permute_type / p.permute_format), with no padding.
layout get_elt_input_layout(permute_reorder_params&p) {
    const auto src_dims = get_input_layout(p).get_dims();
    ov::Shape permuted_dims;
    for (const auto axis : p.permute_order1) {
        permuted_dims.push_back(src_dims[axis]);
    }
    return layout{ ov::PartialShape(permuted_dims), p.permute_type, p.permute_format, padding{} };
}
}; };
} // namespace } // namespace
@ -500,6 +510,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_scale_eltwise_actv_scale_actv, ::t
#define CASE_PERMUTE_REORDER_TILED_F16_9 { 1, 24, 2, 3, 256 }, { 0, 2, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f16, data_types::f32, format::bfzyx, format::bfyx #define CASE_PERMUTE_REORDER_TILED_F16_9 { 1, 24, 2, 3, 256 }, { 0, 2, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f16, data_types::f32, format::bfzyx, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_10 { 1, 35, 3, 253 }, { 0, 2, 3, 1 }, { 0, 4, 1, 3, 2 }, data_types::f16, data_types::f32, format::bfyx, format::bfzyx #define CASE_PERMUTE_REORDER_TILED_F16_10 { 1, 35, 3, 253 }, { 0, 2, 3, 1 }, { 0, 4, 1, 3, 2 }, data_types::f16, data_types::f32, format::bfyx, format::bfzyx
#define CASE_PERMUTE_REORDER_TILED_F16_11 { 1, 32, 3, 253 }, { 0, 2, 3, 1 }, { 0, 5, 1, 4, 2, 3 }, data_types::f16, data_types::f32, format::bfyx, format::bfwzyx #define CASE_PERMUTE_REORDER_TILED_F16_11 { 1, 32, 3, 253 }, { 0, 2, 3, 1 }, { 0, 5, 1, 4, 2, 3 }, data_types::f16, data_types::f32, format::bfyx, format::bfwzyx
// Case 12: { 1, 768, 32, 32 } f16 input with a 4-element first permute order and a
// 5-element second permute order. NOTE(review): the fields presumably follow the same
// order as the cases above (input shape, permute order 1, permute order 2, permute type,
// output type, permute format, output format) - confirm against permute_reorder_params.
#define CASE_PERMUTE_REORDER_TILED_F16_12 { 1, 768, 32, 32 }, { 0, 2, 3, 1 }, { 0, 4, 1, 3, 2}, data_types::f16, data_types::f32, format::bfyx, format::bfzyx
class permute_redundant_reorder : public PermuteReorderFusingTest {}; class permute_redundant_reorder : public PermuteReorderFusingTest {};
TEST_P(permute_redundant_reorder, basic) { TEST_P(permute_redundant_reorder, basic) {
@ -590,3 +601,30 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_act_reorder, ::testing::ValuesIn(s
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_10, 3, 5 }, permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_10, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_11, 3, 5 }, permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_11, 3, 5 },
})); }));
// Fusing test for the chain: permute1 -> eltwise(sum) -> reorder1 -> permute2.
// Parameterized over permute_reorder_params; the fixture adds nothing on top of
// PermuteReorderFusingTest.
class permute_eltwise_reorder : public PermuteReorderFusingTest {};
TEST_P(permute_eltwise_reorder, basic) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
// Constant second operand for the eltwise; its layout is built from the permute
// input dims rearranged by p.permute_order1 (see get_elt_input_layout).
data("elt_data", get_mem(get_elt_input_layout(p))),
permute("permute1", input_info("input"), p.permute_order1),
eltwise("elt", { input_info("permute1"), input_info("elt_data") }, eltwise_mode::sum, p.permute_type),
reorder("reorder1", input_info("elt"), p.output_format, p.output_type), // to be fused to prev permute
permute("permute2", input_info("reorder1"), p.permute_order2) // dummy last op to make reorder fused
);
// Comparison tolerance passed on to the fixture before running the case.
tolerance = 1e-5f;
execute(p);
}
// The tiled optimized permute kernel must not be used when both an eltwise and a reorder
// are fused into the permute. Currently permute_ref is selected for these cases instead
// and is fused with eltwise + reorder.
INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_eltwise_reorder, ::testing::ValuesIn(std::vector<permute_reorder_params>{
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_7, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_8, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_9, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_10, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_11, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_12, 3, 5 },
}));