[GPU] Optimize permute for acdb format (#15139)
* [GPU] Optimize permute for acdb format. Target subgraphs in which the permute is optimized out: (1) input (bfyx) -> permute (byxf) -> conv; (2) conv (byxf) -> permute (bfyx) -> output. Also fix the failing test_device_mem_usage_estimation unit test.
This commit is contained in:
parent
1ae0b2796e
commit
06063201d5
@ -35,6 +35,20 @@ public:
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool is_reverse_rotating_except_batch() const {
|
||||
// Target transform: Rotate feature dim to front to be taken as second outer axis
|
||||
// ex) 0(b), 4(f), 1(x), 2(y), 3(z)
|
||||
// ex) 0(b), 3(f), 1(x), 2(y)
|
||||
auto& order = get_primitive()->permute_order;
|
||||
if ((int32_t) order[order.size() - 2] != 1) return false;
|
||||
if ((int32_t) order[0] != 0) return false;
|
||||
for (int32_t i = 2; i < (int32_t) order.size(); ++i) {
|
||||
if ((int32_t)order[i] != i - 1) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reports the input indices needed for shape inference; this node returns none.
std::vector<size_t> get_shape_infer_dependencies() const override {
    return std::vector<size_t>{};
}
|
||||
};
|
||||
|
||||
@ -53,6 +67,11 @@ public:
|
||||
|
||||
public:
|
||||
typed_primitive_inst(network& network, permute_node const& node);
|
||||
void update_output_memory() override;
|
||||
|
||||
private:
|
||||
void on_execute() override;
|
||||
void reuse_input();
|
||||
};
|
||||
|
||||
using permute_inst = typed_primitive_inst<permute>;
|
||||
|
@ -1872,7 +1872,28 @@ void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, d
|
||||
// WA: shallow convolution needs to set input format by bfyx.
|
||||
// onednn recommended byxf for input format. It will insert reorder before shallow conv.
|
||||
if (node.is_type<convolution>() && node.get_input_layouts()[0].feature() == 3) {
|
||||
src_fmt = format::get_default_format(node.get_input_layouts()[0].get_rank(), false, false);
|
||||
bool can_optimize_permute = false;
|
||||
// In permute-conv pattern, check if permute can be optimized
|
||||
// when the input memory of permute has been aligned like byxf format.
|
||||
// ex) pattern: input (bfyx) -> permute (byxf) -> oneDNN convolution
|
||||
// input layout of permute: bfyx [b:1, f:416, y:416, x:3]
|
||||
// output layout of permute: byxf [b:1, f:3, y:416, x:416]
|
||||
// In this case, it can be handled by changing only the shape of permute without the kernel execution.
|
||||
if (node.get_output_layout().get_rank() == 4 && node.get_dependency(0).is_type<permute>()) {
|
||||
auto& pnode = node.get_dependency(0).as<permute>();
|
||||
can_optimize_permute = pnode.get_users().size() == 1 && pnode.get_dependencies().size() == 1
|
||||
&& !pnode.is_output() && pnode.get_dependency(0).get_output_layout().is_static()
|
||||
&& pnode.is_reverse_rotating_except_batch();
|
||||
}
|
||||
if (!can_optimize_permute) {
|
||||
src_fmt = format::get_default_format(node.get_input_layouts()[0].get_rank(), false, false);
|
||||
} else {
|
||||
// The size of dependencies and users must each be 1.
|
||||
// In permute-conv pattern, the preferred format of permute should follow previous node.
|
||||
node.get_dependency(0).init_preferred_fmt(1, 1);
|
||||
node.get_dependency(0).set_preferred_input_fmt(0, format::bfyx);
|
||||
node.get_dependency(0).can_be_optimized(true);
|
||||
}
|
||||
}
|
||||
|
||||
node.set_preferred_input_fmt(idx, src_fmt);
|
||||
@ -1887,6 +1908,26 @@ void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, d
|
||||
}
|
||||
}
|
||||
|
||||
// In conv-permute pattern, sets the output format of conv to byxf so that permute can be optimized.
|
||||
// ex) oneDNN convolution -> (byxf) -> permute -> (bfyx) -> output
|
||||
// output layout of convolution: byxf [b:1, f:128, y:2, x:2]
|
||||
// output layout of permute: bfyx [b:1, f:2, y:2, x:128]
|
||||
// In this case, it can be handled by changing only the shape of permute without the kernel execution.
|
||||
if (node.get_output_layout().get_rank() == 4
|
||||
&& node.get_users().size() == 1 && node.get_users().front()->is_type<permute>()) {
|
||||
auto& pnode = node.get_users().front()->as<permute>();
|
||||
auto can_optimize_permute = pnode.get_dependencies().size() == 1
|
||||
&& !pnode.is_output() && pnode.get_dependency(0).get_output_layout().is_static()
|
||||
&& pnode.is_rotating_except_batch();
|
||||
if (can_optimize_permute) {
|
||||
dst_fmt = format::byxf;
|
||||
pnode.init_preferred_fmt(1, 1);
|
||||
pnode.set_preferred_input_fmt(0, cldnn::format::byxf);
|
||||
pnode.set_preferred_output_fmt(0, cldnn::format::bfyx);
|
||||
pnode.can_be_optimized(true);
|
||||
}
|
||||
}
|
||||
|
||||
if (node.get_preferred_output_fmt() == format::any) {
|
||||
for (size_t usr = 0; usr < std::max<size_t>(1, node.get_users().size()); usr++)
|
||||
node.set_preferred_output_fmt(usr, dst_fmt);
|
||||
|
@ -38,7 +38,13 @@ layout permute_inst::calc_output_layout(permute_node const& node, kernel_impl_pa
|
||||
input_layout.data_type = impl_param.get_fused_output_layout().data_type;
|
||||
}
|
||||
|
||||
return layout(input_layout.data_type, input_layout.format, output_size, op);
|
||||
// Adjust output format for optimizing out of transpose related to acdb format.
|
||||
auto out_fmt = input_layout.format;
|
||||
if (node.get_preferred_output_fmt() != format::any) {
|
||||
out_fmt = node.get_preferred_output_fmt();
|
||||
}
|
||||
|
||||
return layout(input_layout.data_type, out_fmt, output_size, op);
|
||||
}
|
||||
|
||||
template<typename ShapeType>
|
||||
@ -101,7 +107,8 @@ std::string permute_inst::to_string(permute_node const& node) {
|
||||
return primitive_description.str();
|
||||
}
|
||||
|
||||
permute_inst::typed_primitive_inst(network& network, permute_node const& node) : parent(network, node) {
|
||||
permute_inst::typed_primitive_inst(network& network, permute_node const& node) :
|
||||
parent(network, node, (!node.can_be_optimized() && node.get_output_layout().is_static()) ? true : false) {
|
||||
auto permute_order = argument->permute_order;
|
||||
|
||||
auto required_order_values_size = static_cast<uint32_t>(permute_order.size());
|
||||
@ -110,5 +117,36 @@ permute_inst::typed_primitive_inst(network& network, permute_node const& node) :
|
||||
if (!(std::find(permute_order.begin(), permute_order.end(), i) != permute_order.end()))
|
||||
CLDNN_ERROR_MESSAGE(node.id(), "Permute order does not contain all of required values.");
|
||||
}
|
||||
|
||||
if (node.can_be_optimized()) {
|
||||
reuse_input();
|
||||
}
|
||||
}
|
||||
|
||||
void permute_inst::on_execute() {
|
||||
if (can_be_optimized())
|
||||
reuse_input();
|
||||
}
|
||||
|
||||
void permute_inst::reuse_input() {
|
||||
update_output_memory();
|
||||
}
|
||||
|
||||
void permute_inst::update_output_memory() {
|
||||
if (!can_be_optimized())
|
||||
return;
|
||||
|
||||
if (_outputs.size() > 0 && static_cast<bool>(_outputs[0])
|
||||
&& _network.get_engine().is_the_same_buffer(output_memory(), input_memory()))
|
||||
return;
|
||||
|
||||
if (_node != nullptr)
|
||||
build_deps();
|
||||
|
||||
_outputs = {_network.get_engine().reinterpret_buffer(input_memory(), _impl_params->get_output_layout())};
|
||||
_mem_allocated = false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
} // namespace cldnn
|
||||
|
@ -4054,4 +4054,124 @@ INSTANTIATE_TEST_SUITE_P(implicit_crop_concat_conv_fusings_gpu, implicit_crop_co
|
||||
implicit_crop_concat_convolution_test_params{ CASE_CROP_FQ_CONCAT_1, 5, 9 },
|
||||
}));
|
||||
|
||||
|
||||
// Fixture for the permute-optimization tests. It runs the fused and non-fused
// topologies on in-order queues, compares them, and verifies that the primitive
// whose original id is "permute" reports the kernel id "undef".
class PermuteOptimizingTestOnednn : public BaseFusingTest<convolution_test_params> {
public:
    // Builds both networks from the prepared topologies, feeds them the same input,
    // compares the results and checks the permute primitive's kernel id.
    // Returns immediately on devices that do not report supports_immad.
    void execute(convolution_test_params& p) {
        if (!engine.get_device_info().supports_immad) {
            return;
        }

        // The oneDNN-specific expectation replaces the generic one for this run.
        p.expected_fused_primitives = p.expected_fused_primitives_onednn;

        cldnn::memory::ptr input_prim = get_mem(get_input_layout(p));
        cfg_fused.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
        cfg_not_fused.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));

        network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused);
        network network_fused(this->engine, this->topology_fused, cfg_fused);
        network_fused.set_input_data("input", input_prim);
        network_not_fused.set_input_data("input", input_prim);

        compare(network_not_fused, network_fused, p);

        auto fused_infos = network_fused.get_primitives_info();

        // Report which kernel was selected for the convolution, if it is present.
        const auto is_conv = [](primitive_info& info) -> bool {
            return info.original_id == "conv_prim";
        };
        auto conv_info = std::find_if(fused_infos.begin(), fused_infos.end(), is_conv);
        if (conv_info != fused_infos.end()) {
            std::cout << "kernel: " << conv_info->kernel_id << std::endl;
        }

        // The permute must exist and must not have a kernel assigned.
        const auto is_permute = [](primitive_info& info) -> bool {
            return info.original_id == "permute";
        };
        auto permute_info = std::find_if(fused_infos.begin(), fused_infos.end(), is_permute);

        ASSERT_TRUE(permute_info != fused_infos.end());
        ASSERT_TRUE(permute_info->kernel_id == "undef");
    }

    // Input layout built from the test parameters; the padding vector is
    // { 0, 0, pad[1], pad[0] } taken from p.pad.
    layout get_input_layout(convolution_test_params& p) {
        const auto& pad = p.pad;
        std::vector<int> pad_sizes = { 0, 0, static_cast<int>(pad[1]), static_cast<int>(pad[0]) };
        return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_sizes } };
    }

    // Layout holding one value per output feature: tensor {1, out_features, 1, 1}.
    layout get_per_channel_layout(convolution_test_params& p) {
        return layout{ p.default_type, p.default_format, tensor{1, p.out_shape.feature[0], 1, 1} };
    }

    // Layout for PReLU slope data: tensor {1, out_features, out_shape.spatial[0], 1}
    // in the input format.
    layout get_prelu_slope_layout(convolution_test_params& p) {
        return layout{ p.default_type, p.input_format, tensor{1, p.out_shape.feature[0], p.out_shape.spatial[0], 1} };
    }
};
|
||||
|
||||
|
||||
// Leading initializer fields of a convolution_test_params case (FP16, bfyx); used by the
// conv_after_permute_optimizing instantiation. NOTE(review): field order is presumably
// in_shape, out_shape, kernel, stride, pad, dilation, groups, then type/format pairs for
// input, weights and default — confirm against the convolution_test_params declaration.
#define CASE_CONV_FP16_PERMUTE_1 { 1, 4, 3, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, { 1, 1 }, { 0, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::bfyx, data_types::f16, format::bfyx
|
||||
// Leading initializer fields of a convolution_test_params case (FP16, bfyx); used by the
// conv_before_permute_optimizing instantiation. NOTE(review): field order is presumably
// in_shape, out_shape, kernel, stride, pad, dilation, groups, then type/format pairs for
// input, weights and default — confirm against the convolution_test_params declaration.
#define CASE_CONV_FP16_PERMUTE_2 { 1, 15, 4, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, { 1, 1 }, { 0, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::bfyx, data_types::f16, format::bfyx
|
||||
|
||||
class conv_after_permute_optimizing : public PermuteOptimizingTestOnednn {};
|
||||
TEST_P(conv_after_permute_optimizing, basic) {
|
||||
if (!engine.get_device_info().supports_immad)
|
||||
return;
|
||||
|
||||
auto p = GetParam();
|
||||
|
||||
auto weights_layout = cldnn::layout { p.weights_type, p.weights_format,
|
||||
cldnn::tensor(batch(p.out_shape.feature[0]), feature(p.in_shape.spatial[0]),
|
||||
spatial(p.kernel.spatial[0], p.kernel.spatial[1], p.kernel.spatial[2])) };
|
||||
|
||||
auto bias_layout = cldnn::layout{ p.default_type, format::bfyx, tensor{1, p.out_shape.feature[0], 1, 1} };
|
||||
|
||||
create_topologies(
|
||||
input_layout("input", get_input_layout(p)),
|
||||
data("weights", get_mem(weights_layout)),
|
||||
data("bias", get_mem(bias_layout)),
|
||||
permute("permute", input_info("input"), {0, 3, 1, 2}),
|
||||
convolution("conv_prim", input_info("permute"), { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
|
||||
activation("activation", input_info("conv_prim"), activation_func::abs),
|
||||
reorder("reorder_bfyx", input_info("activation"), p.default_format, data_types::f32)
|
||||
);
|
||||
|
||||
tolerance = default_tolerance(p.default_type);
|
||||
execute(p);
|
||||
}
|
||||
|
||||
// Parameter set for conv_after_permute_optimizing.
INSTANTIATE_TEST_SUITE_P(
    fusings_gpu,
    conv_after_permute_optimizing,
    ::testing::ValuesIn(std::vector<convolution_test_params>{
        convolution_test_params{ CASE_CONV_FP16_PERMUTE_1, 3, 2, 4 },
    }));
|
||||
|
||||
class conv_before_permute_optimizing : public PermuteOptimizingTestOnednn {};
|
||||
TEST_P(conv_before_permute_optimizing, basic) {
|
||||
if (!engine.get_device_info().supports_immad)
|
||||
return;
|
||||
|
||||
auto p = GetParam();
|
||||
|
||||
ov::intel_gpu::ImplementationDesc conv_impl = { cldnn::format::type::any, "", impl_types::onednn };
|
||||
cfg_fused.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "conv_prim", conv_impl } }));
|
||||
|
||||
create_topologies(
|
||||
input_layout("input", get_input_layout(p)),
|
||||
data("weights", get_mem(get_weights_layout(p))),
|
||||
data("bias", get_mem(get_bias_layout(p))),
|
||||
convolution("conv_prim", input_info("input"), { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
|
||||
activation("activation", input_info("conv_prim"), activation_func::abs),
|
||||
permute("permute", input_info("activation"), {0, 2, 3, 1}),
|
||||
reorder("reorder_bfyx", input_info("permute"), p.default_format, data_types::f32)
|
||||
);
|
||||
|
||||
tolerance = default_tolerance(p.default_type);
|
||||
execute(p);
|
||||
}
|
||||
|
||||
// Parameter set for conv_before_permute_optimizing.
INSTANTIATE_TEST_SUITE_P(
    fusings_gpu,
    conv_before_permute_optimizing,
    ::testing::ValuesIn(std::vector<convolution_test_params>{
        convolution_test_params{ CASE_CONV_FP16_PERMUTE_2, 3, 2, 4 },
    }));
|
||||
|
||||
|
||||
|
||||
#endif // ENABLE_ONEDNN_FOR_GPU
|
||||
|
Loading…
Reference in New Issue
Block a user