[GPU] DG2 layout optimizer improvements (#8074)

* [GPU] DG2 layout optimizer improvements

* Fix FP16/FP32 regressions and optimize first quantize/reorder
This commit is contained in:
Sergey Shlyapnikov 2021-10-25 10:55:40 +03:00 committed by GitHub
parent 8b0bec4e04
commit e6f0b8fe88
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 120 additions and 10 deletions

View File

@ -187,12 +187,20 @@ void prepare_primitive_fusing::fuse_reorders(program &p) {
void prepare_primitive_fusing::fuse_activations(program &p) {
bool is_debug = p.get_options().get<build_option_type::debug>()->enabled();
std::map<primitive_id, std::vector<primitive_id>> fusing_history;
bool use_onednn_impls = false;
#ifdef ENABLE_ONEDNN_FOR_GPU
auto& engine = p.get_engine();
if (engine.get_device_info().supports_immad && engine.configuration().queue_type == queue_types::in_order)
use_onednn_impls = true;
#endif
auto itr = p.get_processing_order().begin();
while (itr != p.get_processing_order().end()) {
auto node_itr = itr++;
auto& node = (*node_itr);
program_helpers::do_for_types<activation>(*node, [&p, &is_debug, &fusing_history](activation_node& node) {
program_helpers::do_for_types<activation>(*node, [&p, &is_debug, &fusing_history, &use_onednn_impls](activation_node& node) {
auto& input = node.input();
auto id = node.id();
// Restrictions:
@ -239,6 +247,9 @@ void prepare_primitive_fusing::fuse_activations(program &p) {
return;
}
if (input.is_type<reshape>() && use_onednn_impls)
return;
if (input.get_fused_primitives().empty()) {
input.add_fused_activation(node.get_primitive()->activation_function, node.get_primitive()->additional_params);
for (size_t i = 0; i < node.get_fused_activations_funcs().size(); i++) {

View File

@ -56,7 +56,12 @@ void remove_redundant_reorders::run(program& p) {
// Avoid different data types between input and output
auto same_data_type = input.get_output_layout().data_type == output_layout.data_type;
if (!same_data_type)
auto i8_u8_input = input.get_output_layout().data_type == data_types::i8 ||
input.get_output_layout().data_type == data_types::u8;
auto quantize_user = node.get_users().front()->is_type<quantize>() &&
node.get_users().size() == 1;
if (!same_data_type && !(i8_u8_input && quantize_user))
continue;
// Avoid optimization of nv12 reorder
@ -323,7 +328,7 @@ void remove_redundant_reorders::run(program& p) {
auto& dep = node_ptr->get_dependency(0);
if (!usr->is_type<quantize>() ||
(dep.get_output_layout().format != format::b_fs_yx_fsv16 &&
dep.get_output_layout().format != format::fs_b_yx_fsv32 &&
(lo.get_optimization_attributes().use_onednn_impls || dep.get_output_layout().format != format::fs_b_yx_fsv32) &&
dep.get_output_layout().format != format::bfyx))
continue;
@ -348,7 +353,7 @@ void remove_redundant_reorders::run(program& p) {
auto& dep = node->get_dependency(0);
if (!(usr->is_type<convolution>()) ||
usr->get_output_layout().data_type != dep.get_output_layout().data_type ||
node->get_output_layout().data_type != dep.get_output_layout().data_type ||
dep.get_output_layout().format != format::bfyx)
return false;
if (usr->as<convolution>().get_preferred_impl_type() != impl_types::onednn &&

View File

@ -178,6 +178,8 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
auto prev_output_layout = prev.get_output_layout();
auto next_output_layout = next.get_output_layout();
auto prev_dt = prev.get_output_layout().data_type;
auto next_dt = next.get_output_layout().data_type;
auto use_onednn_impls = _optimization_attributes.use_onednn_impls;
auto is_input_idx = [&](size_t idx) -> bool {
if (&next.get_dependency(idx) == &prev)
@ -191,7 +193,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
return true;
if (next.is_type<pooling>() &&
((prev_simple && next_simple) ||
(((prev_simple && next_simple) && (prev_dt == next_dt)) ||
((fmt_prev == format::b_fs_yx_fsv4 && fmt_next == format::bfyx) && (prev_dt == data_types::u8 || prev_dt == data_types::i8))))
return true;
@ -223,9 +225,12 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
// Additional check: fmt_prev == fmt_next is added only when onednn is enabled.
if (next.is_type<convolution>() &&
fmt_prev == format::bfyx && (!get_optimization_attributes().use_onednn_impls || fmt_prev == fmt_next) &&
(!use_onednn_impls || fmt_prev == fmt_next) &&
(fmt_prev == format::bfyx || fmt_prev == format::bs_fs_yx_bsv4_fsv2) &&
((fmt_next == format::fs_b_yx_fsv32 && next.as<convolution>().get_primitive()->groups == 1) ||
(fmt_next == format::b_fs_yx_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
(fmt_next == format::bs_fs_yx_bsv32_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
(fmt_next == format::bs_fs_yx_bsv32_fsv16 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
(fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] % 16 == 0 && prev_output_layout.size.feature[0] == 3) ||
(fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] == 3 &&
(next_output_layout.data_type != data_types::i8 && next_output_layout.data_type != data_types::u8))))
@ -239,8 +244,10 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
return true;
if (next.is_type<convolution>() &&
fmt_prev == format::b_fs_yx_fsv4 &&
(fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::bs_fs_yx_bsv4_fsv4) &&
((fmt_next == format::b_fs_yx_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
(fmt_next == format::bs_fs_yx_bsv32_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
(fmt_next == format::bs_fs_yx_bsv4_fsv4 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
(fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 &&
(prev_output_layout.size.feature[0] == 3 || (prev_output_layout.size.feature[0] == 4 && (prev_dt == data_types::u8 || prev_dt == data_types::i8))))))
return true;
@ -263,12 +270,51 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] == 3))
return true;
if (use_onednn_impls) {
if (next.is_type<eltwise>() && (fmt_prev == format::bfyx) && (fmt_next == format::bs_fs_yx_bsv4_fsv2) &&
prev.is_input() && (prev_dt == data_types::u8 || prev_dt == data_types::i8))
return true;
if (next.is_type<convolution>() && (prev.is_type<eltwise>() || prev.is_type<quantize>()) &&
(fmt_prev == format::bfyx || fmt_prev == format::bs_fs_yx_bsv4_fsv2) &&
((fmt_next == format::bs_fs_yx_bsv32_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
(fmt_next == format::bs_fs_yx_bsv32_fsv16 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4))))
return true;
if (next.is_type<quantize>())
return true;
if (next.is_type<permute>()) {
auto is_rotating_except_batch = [](const std::vector<uint16_t>& order) {
// Target transform: Rotate feature dim to back to be taken as inner-most axis
// ex) 0(b), 4(f), 1(z), 2(y), 3(x)
// ex) 0(b), 3(f), 1(y), 2(x)
if ((int32_t) order[1] != order.size() - 1) return false;
if ((int32_t) order[0] != 0) return false;
for (int32_t i = 2; i < (int32_t) order.size(); ++i) {
if ((int32_t)order[i] != (i - 1)) return false;
}
return true;
};
auto& permute_order = next.as<permute>().get_primitive()->permute_order;
if ((fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::b_fs_yx_fsv32 || fmt_prev == format::b_fs_zyx_fsv32 ||
fmt_prev == format::b_fs_yx_fsv16 || fmt_prev == format::b_fs_zyx_fsv16 || fmt_prev == format::bs_fs_yx_bsv16_fsv16)
&& permute_order[1] == 2
&& (!is_rotating_except_batch(permute_order))) {
return false;
}
return true;
}
}
return false;
}
bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node& next, format fmt_prev, format fmt_next) {
auto dt_prev = prev.get_output_layout().data_type;
auto dt_next = next.get_output_layout().data_type;
auto use_onednn_impls = _optimization_attributes.use_onednn_impls;
if (prev.is_type<reorder>())
return true;
@ -283,7 +329,7 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
return true;
if (prev.is_type<quantize>() &&
(fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::b_fs_yx_fsv32 || fmt_next == format::b_fs_zyx_fsv32 ||
(fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::b_fs_zyx_fsv32 || (fmt_next == format::b_fs_yx_fsv32 && !use_onednn_impls) ||
fmt_next == format::b_fs_yx_fsv16 || fmt_next == format::b_fs_zyx_fsv16 || fmt_next == format::bs_fs_yx_bsv16_fsv16))
return true;
@ -310,6 +356,11 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
return true;
}
if (use_onednn_impls) {
if (prev.is_type<convolution>() && fmt_next == format::bs_fs_yx_bsv32_fsv16 && fmt_prev == format::bs_fs_yx_bsv4_fsv2)
return true;
}
return false;
}
@ -1270,17 +1321,60 @@ format layout_optimizer::get_preferred_format(program_node& node) {
node.as<detection_output>(),
layout{ data_types::f32, format::bfyx, tensor{} }).format;
} else if (node.is_type<quantize>()) {
bool use_onednn_impls = _optimization_attributes.use_onednn_impls;
auto layout = node.get_output_layout();
if (layout.format.spatial_num() == 2 &&
(layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&
layout.size.batch[0] % 16 == 0) {
if (layout.size.feature[0] > 8) {
if (use_onednn_impls && layout.size.batch[0] % 32 == 0) {
if (node.get_users().size() == 1 && node.get_users().front()->is_type<convolution>()) {
auto& conv = node.get_users().front()->as<convolution>();
auto ws = conv.get_dependency(1).get_output_layout().size;
if (data_type_traits::is_floating_point(conv.get_output_layout().data_type) || ws.spatial[0] != 7 || conv.get_primitive()->groups > 1)
expected = format::bfyx;
else
expected = format::bs_fs_yx_bsv4_fsv4;
auto conv_output_layout = conv.get_output_layout();
auto weights_layout = conv.weights(0).get_output_layout();
format expected_conv_fmt = get_expected_layout(conv_output_layout, conv, weights_layout).format;
if (expected == format::bfyx && expected_conv_fmt == format::bs_fs_yx_bsv32_fsv32 &&
layout.size.feature[0] % 32 == 0) {
expected = expected_conv_fmt;
}
} else {
expected = format::bfyx;
}
} else if (layout.size.feature[0] > 8) {
expected = format::b_fs_yx_fsv16;
} else {
if (use_onednn_impls && layout.size.batch[0] % 16 == 0 && node.get_users().front()->is_type<convolution>()) {
auto& conv = node.get_users().front()->as<convolution>();
auto ws = conv.get_dependency(1).get_output_layout().size;
if (data_type_traits::is_floating_point(conv.get_output_layout().data_type) || ws.spatial[0] != 7 || conv.get_primitive()->groups > 1)
expected = format::bfyx;
else
expected = format::bs_fs_yx_bsv4_fsv4;
} else {
expected = format::b_fs_yx_fsv4;
}
}
} else if (layout.format.spatial_num() == 3 && (layout.data_type == data_types::i8 || layout.data_type == data_types::u8)) {
expected = format::b_fs_zyx_fsv16;
} else if (use_onednn_impls) {
if (node.get_users().size() == 1 && node.get_users().front()->is_type<convolution>() &&
(layout.data_type == data_types::i8 || layout.data_type == data_types::u8)) {
auto& conv = node.get_users().front()->as<convolution>();
auto conv_output_layout = conv.get_output_layout();
auto weights_layout = conv.weights(0).get_output_layout();
format expected_conv_fmt = get_expected_layout(conv_output_layout, conv, weights_layout).format;
if (layout.format.spatial_num() == 2 && layout.size.feature[0] % 32 == 0 && expected_conv_fmt == format::b_fs_yx_fsv32)
expected = expected_conv_fmt;
else
expected = format::bfyx;
} else {
expected = format::bfyx;
}
}
} else if (node.is_type<reorder>() || node.is_type<input_layout>()) {
expected = node.get_output_layout().format;