Optimized permute kernel cannot be fused with both eltwise + reorder. (#15542)
This commit is contained in:
parent
ac1e885324
commit
b9107ac7ca
@ -187,7 +187,7 @@ JitConstants PermuteKernel_tile_8x8_4x4::GetJitConstants(const permute_params& p
|
|||||||
jit.AddConstant(MakeJitConstant("TRANS_BUF_SIZE", (tile_size / vector_width) * tile_size * total_lws) );
|
jit.AddConstant(MakeJitConstant("TRANS_BUF_SIZE", (tile_size / vector_width) * tile_size * total_lws) );
|
||||||
|
|
||||||
if (!params.fused_ops.empty()) {
|
if (!params.fused_ops.empty()) {
|
||||||
std::vector<std::string> output_order = GetFusedOpOrderVector(params.outputs[0].GetDims().size());
|
std::vector<std::string> output_order = GetFusedOpOrderVector(params.inputs[0].GetDims().size());
|
||||||
FusedOpsConfiguration conf = {"", output_order, "input_var", params.inputs[0].GetDType(), 1};
|
FusedOpsConfiguration conf = {"", output_order, "input_var", params.inputs[0].GetDType(), 1};
|
||||||
jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
|
jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
|
||||||
}
|
}
|
||||||
@ -248,6 +248,12 @@ CommonDispatchData PermuteKernel_tile_8x8_4x4::SetDefault(const permute_params&
|
|||||||
bool PermuteKernel_tile_8x8_4x4::Validate(const Params& p, const optional_params& o) const {
|
bool PermuteKernel_tile_8x8_4x4::Validate(const Params& p, const optional_params& o) const {
|
||||||
if (!Parent::Validate(p, o)) return false;
|
if (!Parent::Validate(p, o)) return false;
|
||||||
|
|
||||||
|
const permute_params& params = static_cast<const permute_params&>(p);
|
||||||
|
|
||||||
|
if (params.outputs[0].PitchesDifferFromLogicalDims() || params.inputs[0].PitchesDifferFromLogicalDims()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
std::function<bool(const std::vector<uint16_t>&)> is_rotating_except_batch = [](const std::vector<uint16_t>& order) {
|
std::function<bool(const std::vector<uint16_t>&)> is_rotating_except_batch = [](const std::vector<uint16_t>& order) {
|
||||||
// Target transform: Rotate feature dim to back to be taken as inner-most axis
|
// Target transform: Rotate feature dim to back to be taken as inner-most axis
|
||||||
// ex) 0(b), 4(f), 1(z), 2(y), 3(x)
|
// ex) 0(b), 4(f), 1(z), 2(y), 3(x)
|
||||||
@ -260,15 +266,22 @@ bool PermuteKernel_tile_8x8_4x4::Validate(const Params& p, const optional_params
|
|||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
|
|
||||||
const permute_params& params = static_cast<const permute_params&>(p);
|
|
||||||
|
|
||||||
if (!is_rotating_except_batch(params.order)) {
|
if (!is_rotating_except_batch(params.order)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.outputs[0].PitchesDifferFromLogicalDims() || params.inputs[0].PitchesDifferFromLogicalDims()) {
|
// Reports whether the permute carries at least one fused operation that is
// not a reorder. Fused reorders are ignored, so a permute whose fused ops are
// all reorders (or that has no fused ops at all) yields false.
std::function<bool(const permute_params&)> has_fused_op = [](const permute_params& params) {
    // Iterate by const reference to avoid copying each fused-op descriptor.
    // An empty list simply skips the loop, so no separate emptiness check
    // is needed.
    for (const auto& f : params.fused_ops) {
        if (f.GetType() != KernelType::REORDER)
            return true;
    }
    return false;
};
|
||||||
|
|
||||||
|
if (has_fused_op(params) && params.inputs[0].GetDims().size() != params.outputs[0].GetDims().size())
|
||||||
return false;
|
return false;
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -79,6 +79,16 @@ public:
|
|||||||
layout get_input_layout(permute_reorder_params& p) {
|
layout get_input_layout(permute_reorder_params& p) {
|
||||||
return layout{ p.permute_type, p.permute_format, p.in_shape, padding{} };
|
return layout{ p.permute_type, p.permute_format, p.in_shape, padding{} };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Builds a layout whose shape is the test input's dims gathered in
// permute_order1 order; data type and format are taken unchanged from the
// permute parameters and no padding is applied.
// NOTE(review): presumably this mirrors the output shape of the first permute
// so it can feed an eltwise placed after it -- confirm against the callers.
layout get_elt_input_layout(permute_reorder_params& p) {
    const auto src_dims = get_input_layout(p).get_dims();
    ov::Shape permuted_dims;
    for (const auto src_axis : p.permute_order1) {
        permuted_dims.push_back(src_dims[src_axis]);
    }
    return layout{ ov::PartialShape(permuted_dims), p.permute_type, p.permute_format, padding{} };
}
|
||||||
|
|
||||||
};
|
};
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
@ -500,6 +510,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_scale_eltwise_actv_scale_actv, ::t
|
|||||||
#define CASE_PERMUTE_REORDER_TILED_F16_9 { 1, 24, 2, 3, 256 }, { 0, 2, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f16, data_types::f32, format::bfzyx, format::bfyx
|
#define CASE_PERMUTE_REORDER_TILED_F16_9 { 1, 24, 2, 3, 256 }, { 0, 2, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f16, data_types::f32, format::bfzyx, format::bfyx
|
||||||
#define CASE_PERMUTE_REORDER_TILED_F16_10 { 1, 35, 3, 253 }, { 0, 2, 3, 1 }, { 0, 4, 1, 3, 2 }, data_types::f16, data_types::f32, format::bfyx, format::bfzyx
|
#define CASE_PERMUTE_REORDER_TILED_F16_10 { 1, 35, 3, 253 }, { 0, 2, 3, 1 }, { 0, 4, 1, 3, 2 }, data_types::f16, data_types::f32, format::bfyx, format::bfzyx
|
||||||
#define CASE_PERMUTE_REORDER_TILED_F16_11 { 1, 32, 3, 253 }, { 0, 2, 3, 1 }, { 0, 5, 1, 4, 2, 3 }, data_types::f16, data_types::f32, format::bfyx, format::bfwzyx
|
#define CASE_PERMUTE_REORDER_TILED_F16_11 { 1, 32, 3, 253 }, { 0, 2, 3, 1 }, { 0, 5, 1, 4, 2, 3 }, data_types::f16, data_types::f32, format::bfyx, format::bfwzyx
|
||||||
|
#define CASE_PERMUTE_REORDER_TILED_F16_12 { 1, 768, 32, 32 }, { 0, 2, 3, 1 }, { 0, 4, 1, 3, 2}, data_types::f16, data_types::f32, format::bfyx, format::bfzyx
|
||||||
|
|
||||||
class permute_redundant_reorder : public PermuteReorderFusingTest {};
|
class permute_redundant_reorder : public PermuteReorderFusingTest {};
|
||||||
TEST_P(permute_redundant_reorder, basic) {
|
TEST_P(permute_redundant_reorder, basic) {
|
||||||
@ -590,3 +601,30 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_act_reorder, ::testing::ValuesIn(s
|
|||||||
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_10, 3, 5 },
|
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_10, 3, 5 },
|
||||||
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_11, 3, 5 },
|
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_11, 3, 5 },
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
// Fixture for the permute + eltwise + reorder fusing cases; it adds nothing
// on top of PermuteReorderFusingTest and exists to give the suite its own name.
class permute_eltwise_reorder : public PermuteReorderFusingTest {};
|
||||||
|
|
||||||
|
// Exercises the chain permute -> eltwise(sum) -> reorder -> permute and checks
// the results with a tolerance of 1e-5.
TEST_P(permute_eltwise_reorder, basic) {
    auto test_params = GetParam();

    create_topologies(
        input_layout("input", get_input_layout(test_params)),
        data("elt_data", get_mem(get_elt_input_layout(test_params))),
        permute("permute1", input_info("input"), test_params.permute_order1),
        eltwise("elt", { input_info("permute1"), input_info("elt_data") }, eltwise_mode::sum, test_params.permute_type),
        // reorder1 is the candidate for fusion into the preceding permute
        reorder("reorder1", input_info("elt"), test_params.output_format, test_params.output_type),
        // trailing permute is a placeholder consumer so that reorder1 can be fused
        permute("permute2", input_info("reorder1"), test_params.permute_order2)
    );

    tolerance = 1e-5f;
    execute(test_params);
}
|
||||||
|
|
||||||
|
// The tiled optimized kernel must not be fused with eltwise + reorder. For these cases permute_ref is currently selected instead and is fused with eltwise + reorder.
|
||||||
|
INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_eltwise_reorder, ::testing::ValuesIn(std::vector<permute_reorder_params>{
|
||||||
|
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_7, 3, 5 },
|
||||||
|
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_8, 3, 5 },
|
||||||
|
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_9, 3, 5 },
|
||||||
|
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_10, 3, 5 },
|
||||||
|
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_11, 3, 5 },
|
||||||
|
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_12, 3, 5 },
|
||||||
|
}));
|
||||||
|
Loading…
Reference in New Issue
Block a user