[GPU] DG2 layout optimizer improvements (#8074)
* [GPU] DG2 layout optimizer improvements
* Fix FP16/FP32 regressions and optimize first quantize/reorder
parent 8b0bec4e04
commit e6f0b8fe88
@@ -187,12 +187,20 @@ void prepare_primitive_fusing::fuse_reorders(program &p) {
 
 void prepare_primitive_fusing::fuse_activations(program &p) {
     bool is_debug = p.get_options().get<build_option_type::debug>()->enabled();
     std::map<primitive_id, std::vector<primitive_id>> fusing_history;
+    bool use_onednn_impls = false;
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    auto& engine = p.get_engine();
+    if (engine.get_device_info().supports_immad && engine.configuration().queue_type == queue_types::in_order)
+        use_onednn_impls = true;
+#endif
+
     auto itr = p.get_processing_order().begin();
     while (itr != p.get_processing_order().end()) {
         auto node_itr = itr++;
         auto& node = (*node_itr);
 
-        program_helpers::do_for_types<activation>(*node, [&p, &is_debug, &fusing_history](activation_node& node) {
+        program_helpers::do_for_types<activation>(*node, [&p, &is_debug, &fusing_history, &use_onednn_impls](activation_node& node) {
             auto& input = node.input();
             auto id = node.id();
             // Restrictions:
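Note: the gate above mirrors how the GPU plugin decides whether oneDNN kernels are in play: the device must report supports_immad (systolic dot-product hardware, as on DG2) and the engine must run an in-order queue. A minimal standalone model of that check, using simplified stand-in types rather than the real cldnn::engine API:

#include <iostream>

// Stand-ins for the cldnn types; only the fields the gate reads are modeled.
enum class queue_types { in_order, out_of_order };
struct device_info_stub { bool supports_immad; };

// Mirrors the #ifdef ENABLE_ONEDNN_FOR_GPU block: oneDNN impls are considered
// only on immad-capable devices driven through an in-order queue.
bool should_use_onednn(const device_info_stub& info, queue_types queue) {
    return info.supports_immad && queue == queue_types::in_order;
}

int main() {
    std::cout << should_use_onednn({true}, queue_types::in_order) << '\n';     // 1
    std::cout << should_use_onednn({true}, queue_types::out_of_order) << '\n'; // 0
    std::cout << should_use_onednn({false}, queue_types::in_order) << '\n';    // 0
}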
@@ -239,6 +247,9 @@ void prepare_primitive_fusing::fuse_activations(program &p) {
                 return;
             }
 
+            if (input.is_type<reshape>() && use_onednn_impls)
+                return;
+
             if (input.get_fused_primitives().empty()) {
                 input.add_fused_activation(node.get_primitive()->activation_function, node.get_primitive()->additional_params);
                 for (size_t i = 0; i < node.get_fused_activations_funcs().size(); i++) {
@@ -56,7 +56,12 @@ void remove_redundant_reorders::run(program& p) {
 
         // Avoid different data types between input and output
         auto same_data_type = input.get_output_layout().data_type == output_layout.data_type;
-        if (!same_data_type)
+        auto i8_u8_input = input.get_output_layout().data_type == data_types::i8 ||
+                           input.get_output_layout().data_type == data_types::u8;
+        auto quantize_user = node.get_users().front()->is_type<quantize>() &&
+                             node.get_users().size() == 1;
+
+        if (!same_data_type && !(i8_u8_input && quantize_user))
             continue;
 
         // Avoid optimization of nv12 reorder
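Note: the pass previously kept any reorder whose input and output data types differ. The change carves out one exception: an i8/u8 input whose only user is a quantize node, since the quantize re-quantizes its input anyway and can absorb the conversion. A simplified sketch of the relaxed predicate, with stand-in types in place of the cldnn node API:

// Stand-in types; data_types and the user query are simplified versions of
// the cldnn accessors used in the hunk above.
enum class data_types { i8, u8, f16, f32 };

struct reorder_view {
    data_types input_dt;
    data_types output_dt;
    bool single_quantize_user;  // node.get_users().size() == 1 && front is quantize
};

// A dtype-changing reorder is now removable when its i8/u8 input feeds a
// lone quantize; otherwise it must be kept.
bool reorder_is_removable(const reorder_view& r) {
    bool same_data_type = r.input_dt == r.output_dt;
    bool i8_u8_input = r.input_dt == data_types::i8 || r.input_dt == data_types::u8;
    return same_data_type || (i8_u8_input && r.single_quantize_user);
}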
@@ -323,7 +328,7 @@ void remove_redundant_reorders::run(program& p) {
         auto& dep = node_ptr->get_dependency(0);
         if (!usr->is_type<quantize>() ||
             (dep.get_output_layout().format != format::b_fs_yx_fsv16 &&
-             dep.get_output_layout().format != format::fs_b_yx_fsv32 &&
+             (lo.get_optimization_attributes().use_onednn_impls || dep.get_output_layout().format != format::fs_b_yx_fsv32) &&
              dep.get_output_layout().format != format::bfyx))
             continue;
 
@@ -348,7 +353,7 @@ void remove_redundant_reorders::run(program& p) {
         auto& dep = node->get_dependency(0);
 
         if (!(usr->is_type<convolution>()) ||
-            usr->get_output_layout().data_type != dep.get_output_layout().data_type ||
+            node->get_output_layout().data_type != dep.get_output_layout().data_type ||
             dep.get_output_layout().format != format::bfyx)
             return false;
         if (usr->as<convolution>().get_preferred_impl_type() != impl_types::onednn &&
@@ -178,6 +178,8 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
     auto prev_output_layout = prev.get_output_layout();
     auto next_output_layout = next.get_output_layout();
     auto prev_dt = prev.get_output_layout().data_type;
     auto next_dt = next.get_output_layout().data_type;
+    auto use_onednn_impls = _optimization_attributes.use_onednn_impls;
+
     auto is_input_idx = [&](size_t idx) -> bool {
         if (&next.get_dependency(idx) == &prev)
@@ -191,7 +193,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
         return true;
 
     if (next.is_type<pooling>() &&
-        ((prev_simple && next_simple) ||
+        (((prev_simple && next_simple) && (prev_dt == next_dt)) ||
         ((fmt_prev == format::b_fs_yx_fsv4 && fmt_next == format::bfyx) && (prev_dt == data_types::u8 || prev_dt == data_types::i8))))
         return true;
 
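Note: the pooling branch previously fused a reorder whenever both formats were "simple", even when the reorder also converted the data type; folding an FP32-to-FP16 (or FP16-to-FP32) reorder into pooling silently changed precision, which is one source of the FP16/FP32 regressions named in the commit message. The before/after rules as plain predicates (a simplified sketch, with hypothetical helper names):

enum class data_types { i8, u8, f16, f32 };

// Old rule: format simplicity alone decided fusion.
bool can_fuse_into_pooling_old(bool prev_simple, bool next_simple) {
    return prev_simple && next_simple;
}

// New rule: the reorder must also be dtype-preserving to be fused.
bool can_fuse_into_pooling_new(bool prev_simple, bool next_simple,
                               data_types prev_dt, data_types next_dt) {
    return prev_simple && next_simple && prev_dt == next_dt;
}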
@@ -223,9 +225,12 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
 
+    // Additional check: fmt_prev == fmt_next is added only when onednn is enabled.
     if (next.is_type<convolution>() &&
-        fmt_prev == format::bfyx && (!get_optimization_attributes().use_onednn_impls || fmt_prev == fmt_next) &&
+        (!use_onednn_impls || fmt_prev == fmt_next) &&
+        (fmt_prev == format::bfyx || fmt_prev == format::bs_fs_yx_bsv4_fsv2) &&
         ((fmt_next == format::fs_b_yx_fsv32 && next.as<convolution>().get_primitive()->groups == 1) ||
         (fmt_next == format::b_fs_yx_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
         (fmt_next == format::bs_fs_yx_bsv32_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
+        (fmt_next == format::bs_fs_yx_bsv32_fsv16 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
         (fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] % 16 == 0 && prev_output_layout.size.feature[0] == 3) ||
         (fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] == 3 &&
         (next_output_layout.data_type != data_types::i8 && next_output_layout.data_type != data_types::u8))))
@@ -239,8 +244,10 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
         return true;
 
     if (next.is_type<convolution>() &&
-        fmt_prev == format::b_fs_yx_fsv4 &&
+        (fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::bs_fs_yx_bsv4_fsv4) &&
         ((fmt_next == format::b_fs_yx_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
+        (fmt_next == format::bs_fs_yx_bsv32_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
+        (fmt_next == format::bs_fs_yx_bsv4_fsv4 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
         (fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 &&
         (prev_output_layout.size.feature[0] == 3 || (prev_output_layout.size.feature[0] == 4 && (prev_dt == data_types::u8 || prev_dt == data_types::i8))))))
         return true;
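Note: the recurring feature[0] == 3 || feature[0] == 4 test in these convolution branches likely identifies a network's first convolution, whose input is raw 3-channel (RGB) or 4-channel (RGBA) data; only there is fusing the input reorder into a blocked DG2 layout worthwhile. A hypothetical helper naming that intent:

// Hypothetical helper capturing the repeated feature-count test above:
// 3- or 4-channel inputs mark the first convolution of a network.
bool is_first_conv_input(int input_features) {
    return input_features == 3 || input_features == 4;
}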
@@ -263,12 +270,51 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
         next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] == 3))
         return true;
 
+    if (use_onednn_impls) {
+        if (next.is_type<eltwise>() && (fmt_prev == format::bfyx) && (fmt_next == format::bs_fs_yx_bsv4_fsv2) &&
+            prev.is_input() && (prev_dt == data_types::u8 || prev_dt == data_types::i8))
+            return true;
+
+        if (next.is_type<convolution>() && (prev.is_type<eltwise>() || prev.is_type<quantize>()) &&
+            (fmt_prev == format::bfyx || fmt_prev == format::bs_fs_yx_bsv4_fsv2) &&
+            ((fmt_next == format::bs_fs_yx_bsv32_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
+             (fmt_next == format::bs_fs_yx_bsv32_fsv16 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4))))
+            return true;
+
+        if (next.is_type<quantize>())
+            return true;
+
+        if (next.is_type<permute>()) {
+            auto is_rotating_except_batch = [](const std::vector<uint16_t>& order) {
+                // Target transform: Rotate feature dim to back to be taken as inner-most axis
+                // ex) 0(b), 4(f), 1(z), 2(y), 3(x)
+                // ex) 0(b), 3(f), 1(y), 2(x)
+                if ((int32_t) order[1] != order.size() - 1) return false;
+                if ((int32_t) order[0] != 0) return false;
+                for (int32_t i = 2; i < (int32_t) order.size(); ++i) {
+                    if ((int32_t)order[i] != (i - 1)) return false;
+                }
+                return true;
+            };
+
+            auto& permute_order = next.as<permute>().get_primitive()->permute_order;
+            if ((fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::b_fs_yx_fsv32 || fmt_prev == format::b_fs_zyx_fsv32 ||
+                fmt_prev == format::b_fs_yx_fsv16 || fmt_prev == format::b_fs_zyx_fsv16 || fmt_prev == format::bs_fs_yx_bsv16_fsv16)
+                && permute_order[1] == 2
+                && (!is_rotating_except_batch(permute_order))) {
+                return false;
+            }
+            return true;
+        }
+    }
 
     return false;
 }
 
 bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node& next, format fmt_prev, format fmt_next) {
     auto dt_prev = prev.get_output_layout().data_type;
     auto dt_next = next.get_output_layout().data_type;
+    auto use_onednn_impls = _optimization_attributes.use_onednn_impls;
+
     if (prev.is_type<reorder>())
         return true;
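Note: is_rotating_except_batch accepts only permutations that keep batch first and rotate the feature axis to the inner-most position; under oneDNN such permutes can consume blocked layouts directly, while any other permutation of a blocked input blocks the reorder fusion. A standalone copy of the predicate for experimentation, with worked example orders (same logic as the lambda above, outside the cldnn types):

#include <cstdint>
#include <iostream>
#include <vector>

bool is_rotating_except_batch(const std::vector<uint16_t>& order) {
    // Accept only: batch stays at 0, feature moves to the last position,
    // and the spatial axes shift down by one, e.g. {0, 3, 1, 2} for bfyx.
    if ((int32_t) order[1] != (int32_t) order.size() - 1) return false;
    if (order[0] != 0) return false;
    for (int32_t i = 2; i < (int32_t) order.size(); ++i) {
        if ((int32_t) order[i] != i - 1) return false;
    }
    return true;
}

int main() {
    std::cout << is_rotating_except_batch({0, 3, 1, 2}) << '\n';    // 1: 0(b), 3(f), 1(y), 2(x)
    std::cout << is_rotating_except_batch({0, 4, 1, 2, 3}) << '\n'; // 1: 5-D variant
    std::cout << is_rotating_except_batch({0, 2, 3, 1}) << '\n';    // 0: not such a rotation
}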
@@ -283,7 +329,7 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
         return true;
 
     if (prev.is_type<quantize>() &&
-        (fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::b_fs_yx_fsv32 || fmt_next == format::b_fs_zyx_fsv32 ||
+        (fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::b_fs_zyx_fsv32 || (fmt_next == format::b_fs_yx_fsv32 && !use_onednn_impls) ||
         fmt_next == format::b_fs_yx_fsv16 || fmt_next == format::b_fs_zyx_fsv16 || fmt_next == format::bs_fs_yx_bsv16_fsv16))
         return true;
 
@@ -310,6 +356,11 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
         return true;
     }
 
+    if (use_onednn_impls) {
+        if (prev.is_type<convolution>() && fmt_next == format::bs_fs_yx_bsv32_fsv16 && fmt_prev == format::bs_fs_yx_bsv4_fsv2)
+            return true;
+    }
+
     return false;
 }
 
@@ -1270,17 +1321,60 @@ format layout_optimizer::get_preferred_format(program_node& node) {
                                       node.as<detection_output>(),
                                       layout{ data_types::f32, format::bfyx, tensor{} }).format;
     } else if (node.is_type<quantize>()) {
+        bool use_onednn_impls = _optimization_attributes.use_onednn_impls;
         auto layout = node.get_output_layout();
         if (layout.format.spatial_num() == 2 &&
             (layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&
             layout.size.batch[0] % 16 == 0) {
-            if (layout.size.feature[0] > 8) {
+            if (use_onednn_impls && layout.size.batch[0] % 32 == 0) {
+                if (node.get_users().size() == 1 && node.get_users().front()->is_type<convolution>()) {
+                    auto& conv = node.get_users().front()->as<convolution>();
+                    auto ws = conv.get_dependency(1).get_output_layout().size;
+                    if (data_type_traits::is_floating_point(conv.get_output_layout().data_type) || ws.spatial[0] != 7 || conv.get_primitive()->groups > 1)
+                        expected = format::bfyx;
+                    else
+                        expected = format::bs_fs_yx_bsv4_fsv4;
+
+                    auto conv_output_layout = conv.get_output_layout();
+                    auto weights_layout = conv.weights(0).get_output_layout();
+                    format expected_conv_fmt = get_expected_layout(conv_output_layout, conv, weights_layout).format;
+                    if (expected == format::bfyx && expected_conv_fmt == format::bs_fs_yx_bsv32_fsv32 &&
+                        layout.size.feature[0] % 32 == 0) {
+                        expected = expected_conv_fmt;
+                    }
+                } else {
+                    expected = format::bfyx;
+                }
+            } else if (layout.size.feature[0] > 8) {
                 expected = format::b_fs_yx_fsv16;
             } else {
-                expected = format::b_fs_yx_fsv4;
+                if (use_onednn_impls && layout.size.batch[0] % 16 == 0 && node.get_users().front()->is_type<convolution>()) {
+                    auto& conv = node.get_users().front()->as<convolution>();
+                    auto ws = conv.get_dependency(1).get_output_layout().size;
+                    if (data_type_traits::is_floating_point(conv.get_output_layout().data_type) || ws.spatial[0] != 7 || conv.get_primitive()->groups > 1)
+                        expected = format::bfyx;
+                    else
+                        expected = format::bs_fs_yx_bsv4_fsv4;
+                } else {
+                    expected = format::b_fs_yx_fsv4;
+                }
             }
         } else if (layout.format.spatial_num() == 3 && (layout.data_type == data_types::i8 || layout.data_type == data_types::u8)) {
             expected = format::b_fs_zyx_fsv16;
+        } else if (use_onednn_impls) {
+            if (node.get_users().size() == 1 && node.get_users().front()->is_type<convolution>() &&
+                (layout.data_type == data_types::i8 || layout.data_type == data_types::u8)) {
+                auto& conv = node.get_users().front()->as<convolution>();
+                auto conv_output_layout = conv.get_output_layout();
+                auto weights_layout = conv.weights(0).get_output_layout();
+                format expected_conv_fmt = get_expected_layout(conv_output_layout, conv, weights_layout).format;
+                if (layout.format.spatial_num() == 2 && layout.size.feature[0] % 32 == 0 && expected_conv_fmt == format::b_fs_yx_fsv32)
+                    expected = expected_conv_fmt;
+                else
+                    expected = format::bfyx;
+            } else {
+                expected = format::bfyx;
+            }
         }
     } else if (node.is_type<reorder>() || node.is_type<input_layout>()) {
         expected = node.get_output_layout().format;
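Note: this is the "first quantize/reorder" optimization from the commit message. For a 2-D int8 quantize with a batch divisible by 16, the oneDNN path checks whether the sole consumer is a non-grouped int8 convolution whose weights are 7 wide (ws.spatial[0] == 7, which appears to single out typical 7x7 first-layer convolutions) and, if so, emits bs_fs_yx_bsv4_fsv4 instead of plain bfyx. A simplified decision model of that branch; the types are stand-ins and the expected_conv_fmt override paths are omitted, and the caller is assumed to have already verified spatial_num() == 2, i8/u8 output, and batch % 16 == 0:

enum class fmt { bfyx, b_fs_yx_fsv4, b_fs_yx_fsv16, bs_fs_yx_bsv4_fsv4 };

struct quantize_view {
    bool use_onednn_impls;      // _optimization_attributes.use_onednn_impls
    int batch;                  // layout.size.batch[0]
    int feature;                // layout.size.feature[0]
    bool int8_first_conv_user;  // sole user: non-grouped int8 conv with 7-wide weights
};

fmt preferred_quantize_format_2d(const quantize_view& q) {
    if (q.use_onednn_impls && q.batch % 32 == 0)
        return q.int8_first_conv_user ? fmt::bs_fs_yx_bsv4_fsv4 : fmt::bfyx;
    if (q.feature > 8)
        return fmt::b_fs_yx_fsv16;  // feature-blocked layout for the cl kernels
    return fmt::b_fs_yx_fsv4;       // default; the onednn sub-branch may still pick bsv4_fsv4
}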