[GPU] Optimize permute for acdb format (#15139)

* [GPU] Optimize permute for acdb format

Target subgraphs to be optimized out:
- input(bfyx) - permute(byxf) - conv
- conv(byxf) - permute(bfyx) - output
+ Fix test_device_mem_usage_estimation unit test failure.
Author: Jade Cho (committed by GitHub), 2023-01-31 17:32:57 +09:00
Parent: 1ae0b2796e
Commit: 06063201d5
4 changed files with 221 additions and 3 deletions
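
For context, a minimal sketch of the first target subgraph, written with the same cldnn topology helpers that appear in the new unit tests below; in_layout and the ids are illustrative, not part of the commit:

// Illustrative sketch only: the permute-conv pattern this commit optimizes out.
// in_layout: a static bfyx layout such as [b:1, f:416, y:416, x:3] (see the
// layout_optimizer comments below); construction elided.
topology t(
    input_layout("input", in_layout),
    permute("permute", input_info("input"), { 0, 3, 1, 2 }),  // bfyx -> byxf rotation
    convolution("conv_prim", input_info("permute"), { "weights" }, { "bias" },
                1, { 1, 1 }, { 0, 0 }, { 1, 1 }));
// After select_preferred_formats_for_onednn() runs, the permute is marked
// can_be_optimized(true) and executes as a zero-cost buffer reinterpretation.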

--- File 1 of 4 ---

@@ -35,6 +35,20 @@ public:
}
return true;
}
bool is_reverse_rotating_except_batch() const {
// Target transform: rotate the feature dim to the front so that it is taken as the second outermost axis
// ex) 0(b), 4(f), 1(x), 2(y), 3(z)
// ex) 0(b), 3(f), 1(x), 2(y)
auto& order = get_primitive()->permute_order;
if ((int32_t) order[1] != (int32_t) order.size() - 1) return false;
if ((int32_t) order[0] != 0) return false;
for (int32_t i = 2; i < (int32_t) order.size(); ++i) {
if ((int32_t)order[i] != i - 1) return false;
}
return true;
}
std::vector<size_t> get_shape_infer_dependencies() const override { return {}; }
};
@@ -53,6 +67,11 @@ public:
public:
typed_primitive_inst(network& network, permute_node const& node);
void update_output_memory() override;
private:
void on_execute() override;
void reuse_input();
};
using permute_inst = typed_primitive_inst<permute>;
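
As a sanity check on the new predicate, a standalone mirror of is_reverse_rotating_except_batch; the free function reverse_rotates_except_batch is hypothetical, not part of the commit:

#include <cassert>
#include <cstdint>
#include <vector>

// Batch stays at the front, the innermost input axis rotates up to position 1,
// and every remaining axis shifts down by one: {0, 3, 1, 2} or {0, 4, 1, 2, 3}.
static bool reverse_rotates_except_batch(const std::vector<uint16_t>& order) {
    if (order.size() < 2 || order[0] != 0) return false;
    if (static_cast<int32_t>(order[1]) != static_cast<int32_t>(order.size()) - 1) return false;
    for (int32_t i = 2; i < static_cast<int32_t>(order.size()); ++i)
        if (static_cast<int32_t>(order[i]) != i - 1) return false;
    return true;
}

int main() {
    assert(reverse_rotates_except_batch({0, 3, 1, 2}));      // 4D example from the comment
    assert(reverse_rotates_except_batch({0, 4, 1, 2, 3}));   // 5D example from the comment
    assert(!reverse_rotates_except_batch({0, 2, 3, 1}));     // forward rotation is rejected here
}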

--- File 2 of 4 ---

@@ -1872,7 +1872,28 @@ void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, d
// WA: shallow convolution needs to set input format by bfyx.
// onednn recommended byxf for input format. It will insert reorder before shallow conv.
if (node.is_type<convolution>() && node.get_input_layouts()[0].feature() == 3) {
-    src_fmt = format::get_default_format(node.get_input_layouts()[0].get_rank(), false, false);
+    bool can_optimize_permute = false;
+    // In the permute-conv pattern, check whether the permute can be optimized out
+    // when the input memory of the permute is already aligned like byxf format.
+    // ex) pattern: input (bfyx) -> permute (byxf) -> oneDNN convolution
+    //     input layout of permute:  bfyx [b:1, f:416, y:416, x:3]
+    //     output layout of permute: byxf [b:1, f:3, y:416, x:416]
+    // In this case, it can be handled by changing only the shape of the permute, without executing its kernel.
+    if (node.get_output_layout().get_rank() == 4 && node.get_dependency(0).is_type<permute>()) {
+        auto& pnode = node.get_dependency(0).as<permute>();
+        can_optimize_permute = pnode.get_users().size() == 1 && pnode.get_dependencies().size() == 1
+            && !pnode.is_output() && pnode.get_dependency(0).get_output_layout().is_static()
+            && pnode.is_reverse_rotating_except_batch();
+    }
+    if (!can_optimize_permute) {
+        src_fmt = format::get_default_format(node.get_input_layouts()[0].get_rank(), false, false);
+    } else {
+        // The sizes of dependencies and users must each be 1 (checked above).
+        // In the permute-conv pattern, the preferred format of the permute should follow its previous node.
+        node.get_dependency(0).init_preferred_fmt(1, 1);
+        node.get_dependency(0).set_preferred_input_fmt(0, format::bfyx);
+        node.get_dependency(0).can_be_optimized(true);
+    }
}
node.set_preferred_input_fmt(idx, src_fmt);
@@ -1887,6 +1908,26 @@ void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, d
}
}
// In the conv-permute pattern, set the output format of the conv to byxf so that the permute can be optimized out.
// ex) oneDNN convolution -> (byxf) -> permute -> (bfyx) -> output
//     output layout of convolution: byxf [b:1, f:128, y:2, x:2]
//     output layout of permute:     bfyx [b:1, f:2, y:2, x:128]
// In this case, it can be handled by changing only the shape of the permute, without executing its kernel.
if (node.get_output_layout().get_rank() == 4
&& node.get_users().size() == 1 && node.get_users().front()->is_type<permute>()) {
auto& pnode = node.get_users().front()->as<permute>();
auto can_optimize_permute = pnode.get_dependencies().size() == 1
&& !pnode.is_output() && pnode.get_dependency(0).get_output_layout().is_static()
&& pnode.is_rotating_except_batch();
if (can_optimize_permute) {
dst_fmt = format::byxf;
pnode.init_preferred_fmt(1, 1);
pnode.set_preferred_input_fmt(0, cldnn::format::byxf);
pnode.set_preferred_output_fmt(0, cldnn::format::bfyx);
pnode.can_be_optimized(true);
}
}
if (node.get_preferred_output_fmt() == format::any) {
for (size_t usr = 0; usr < std::max<size_t>(1, node.get_users().size()); usr++)
node.set_preferred_output_fmt(usr, dst_fmt);
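
Why both patterns reduce to a reinterpretation: with the layouts in the comments above, every element keeps the same linear offset, so only the layout descriptor needs to change. A self-contained arithmetic check (shapes from the permute-conv example; plain C++, no cldnn API):

#include <cassert>

int main() {
    // Input:  bfyx [b:1, f:416, y:416, x:3], offset = ((b*F + f)*Y + y)*X + x.
    // Output: byxf [b:1, f:3, y:416, x:416] after permute order {0, 3, 1, 2},
    //         i.e. out_f = in_x, out_y = in_f, out_x = in_y.
    const int F = 416, Y = 416, X = 3;
    for (int f = 0; f < F; ++f)
        for (int y = 0; y < Y; ++y)
            for (int x = 0; x < X; ++x) {
                int in_off = (f * Y + y) * X + x;          // bfyx offset, b = 0
                int of = x, oy = f, ox = y;                // permuted coordinates
                const int OF = X, OX = Y;                  // output sizes (OY = F unused for b = 0)
                int out_off = (oy * OX + ox) * OF + of;    // byxf physical order: b, y, x, f
                assert(in_off == out_off);                 // same byte: zero-copy permute
            }
    return 0;
}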

--- File 3 of 4 ---

@@ -38,7 +38,13 @@ layout permute_inst::calc_output_layout(permute_node const& node, kernel_impl_pa
input_layout.data_type = impl_param.get_fused_output_layout().data_type;
}
-    return layout(input_layout.data_type, input_layout.format, output_size, op);
+    // Adjust the output format so that the transpose related to acdb format can be optimized out.
+    auto out_fmt = input_layout.format;
+    if (node.get_preferred_output_fmt() != format::any) {
+        out_fmt = node.get_preferred_output_fmt();
+    }
+    return layout(input_layout.data_type, out_fmt, output_size, op);
}
template<typename ShapeType>
@@ -101,7 +107,8 @@ std::string permute_inst::to_string(permute_node const& node) {
return primitive_description.str();
}
-permute_inst::typed_primitive_inst(network& network, permute_node const& node) : parent(network, node) {
+permute_inst::typed_primitive_inst(network& network, permute_node const& node) :
+    parent(network, node, (!node.can_be_optimized() && node.get_output_layout().is_static()) ? true : false) {
auto permute_order = argument->permute_order;
auto required_order_values_size = static_cast<uint32_t>(permute_order.size());
@@ -110,5 +117,36 @@ permute_inst::typed_primitive_inst(network& network, permute_node const& node) :
if (!(std::find(permute_order.begin(), permute_order.end(), i) != permute_order.end()))
CLDNN_ERROR_MESSAGE(node.id(), "Permute order does not contain all of required values.");
}
if (node.can_be_optimized()) {
reuse_input();
}
}
void permute_inst::on_execute() {
if (can_be_optimized())
reuse_input();
}
void permute_inst::reuse_input() {
update_output_memory();
}
void permute_inst::update_output_memory() {
if (!can_be_optimized())
return;
if (_outputs.size() > 0 && static_cast<bool>(_outputs[0])
&& _network.get_engine().is_the_same_buffer(output_memory(), input_memory()))
return;
if (_node != nullptr)
build_deps();
_outputs = {_network.get_engine().reinterpret_buffer(input_memory(), _impl_params->get_output_layout())};
_mem_allocated = false;
}
} // namespace cldnn
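
The reuse path above republishes the input allocation under the output layout instead of running a kernel, and clears _mem_allocated so the memory pool does not count the aliased buffer twice (presumably what fixed the test_device_mem_usage_estimation failure mentioned in the commit message). A hypothetical post-condition in the style of the unit tests below:

// Hypothetical check (not in the commit): after an optimized-out permute
// executes, its output memory must alias its input memory.
ASSERT_TRUE(engine.is_the_same_buffer(inst.output_memory(), inst.input_memory()));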

--- File 4 of 4 ---

@@ -4054,4 +4054,124 @@ INSTANTIATE_TEST_SUITE_P(implicit_crop_concat_conv_fusings_gpu, implicit_crop_co
implicit_crop_concat_convolution_test_params{ CASE_CROP_FQ_CONCAT_1, 5, 9 },
}));
class PermuteOptimizingTestOnednn : public BaseFusingTest<convolution_test_params> {
public:
void execute(convolution_test_params& p) {
if (!engine.get_device_info().supports_immad)
return;
p.expected_fused_primitives = p.expected_fused_primitives_onednn;
cldnn::memory::ptr input_prim = get_mem(get_input_layout(p));
cfg_fused.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
cfg_not_fused.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused);
network network_fused(this->engine, this->topology_fused, cfg_fused);
network_fused.set_input_data("input", input_prim);
network_not_fused.set_input_data("input", input_prim);
compare(network_not_fused, network_fused, p);
auto find_conv = [](primitive_info& p) -> bool {
if (p.original_id == "conv_prim")
return true;
return false;
};
auto pi_fused = network_fused.get_primitives_info();
auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv);
if (info_fused != pi_fused.end())
std::cout << "kernel: " << info_fused->kernel_id << std::endl;
auto permute_prim = std::find_if(pi_fused.begin(), pi_fused.end(), [](primitive_info& p) -> bool {
if (p.original_id == "permute")
return true;
return false;
});
ASSERT_TRUE(permute_prim != pi_fused.end());
ASSERT_TRUE(permute_prim->kernel_id == "undef");
}
layout get_input_layout(convolution_test_params& p) {
auto pad = p.pad;
std::vector<int> pad_ = { 0, 0, static_cast<int>(pad[1]), static_cast<int>(pad[0]) };
return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } };
}
layout get_per_channel_layout(convolution_test_params& p) {
return layout{ p.default_type, p.default_format, tensor{1, p.out_shape.feature[0], 1, 1} };
}
layout get_prelu_slope_layout(convolution_test_params& p) {
return layout{ p.default_type, p.input_format, tensor{1, p.out_shape.feature[0], p.out_shape.spatial[0], 1} };
}
};
#define CASE_CONV_FP16_PERMUTE_1 { 1, 4, 3, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, { 1, 1 }, { 0, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::bfyx, data_types::f16, format::bfyx
#define CASE_CONV_FP16_PERMUTE_2 { 1, 15, 4, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, { 1, 1 }, { 0, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::bfyx, data_types::f16, format::bfyx
class conv_after_permute_optimizing : public PermuteOptimizingTestOnednn {};
TEST_P(conv_after_permute_optimizing, basic) {
if (!engine.get_device_info().supports_immad)
return;
auto p = GetParam();
auto weights_layout = cldnn::layout { p.weights_type, p.weights_format,
cldnn::tensor(batch(p.out_shape.feature[0]), feature(p.in_shape.spatial[0]),
spatial(p.kernel.spatial[0], p.kernel.spatial[1], p.kernel.spatial[2])) };
auto bias_layout = cldnn::layout{ p.default_type, format::bfyx, tensor{1, p.out_shape.feature[0], 1, 1} };
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(weights_layout)),
data("bias", get_mem(bias_layout)),
permute("permute", input_info("input"), {0, 3, 1, 2}),
convolution("conv_prim", input_info("permute"), { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
activation("activation", input_info("conv_prim"), activation_func::abs),
reorder("reorder_bfyx", input_info("activation"), p.default_format, data_types::f32)
);
tolerance = default_tolerance(p.default_type);
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_after_permute_optimizing, ::testing::ValuesIn(std::vector<convolution_test_params>{
convolution_test_params{ CASE_CONV_FP16_PERMUTE_1, 3, 2, 4 },
}));
class conv_before_permute_optimizing : public PermuteOptimizingTestOnednn {};
TEST_P(conv_before_permute_optimizing, basic) {
if (!engine.get_device_info().supports_immad)
return;
auto p = GetParam();
ov::intel_gpu::ImplementationDesc conv_impl = { cldnn::format::type::any, "", impl_types::onednn };
cfg_fused.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "conv_prim", conv_impl } }));
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
convolution("conv_prim", input_info("input"), { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
activation("activation", input_info("conv_prim"), activation_func::abs),
permute("permute", input_info("activation"), {0, 2, 3, 1}),
reorder("reorder_bfyx", input_info("permute"), p.default_format, data_types::f32)
);
tolerance = default_tolerance(p.default_type);
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_before_permute_optimizing, ::testing::ValuesIn(std::vector<convolution_test_params>{
convolution_test_params{ CASE_CONV_FP16_PERMUTE_2, 3, 2, 4 },
}));
#endif // ENABLE_ONEDNN_FOR_GPU