[IE CLDNN] Fixed layout optimizer (#4325)

commit  45ae389842
parent  695d2a90ba
Author: Vladimir Paramuzov
Date:   2021-02-17 18:25:16 +03:00 (committed by GitHub)

2 changed files with 82 additions and 74 deletions

layout_optimizer.h

@@ -149,24 +149,28 @@ private:
                               const layout& output_layout,
                               const layout& weights_layout,
                               const convolution_node& node);
-    bool convolution_b_fs_yx_fsv16_opt(const layout &output_layout,
-                                       const layout &weights_layout,
-                                       std::shared_ptr<const convolution> conv,
-                                       bool weak_restrictions = false);
-    bool convolution_b_fs_zyx_fsv16_opt(const layout &output_layout,
-                                        const layout &weights_layout,
-                                        std::shared_ptr<const convolution> conv);
-    bool convolution_bs_fs_yx_bsv16_fsv16_opt(const layout &input_layout,
-                                              const layout& weights_layout,
-                                              std::shared_ptr<const convolution> conv);
-    bool convolution_fs_b_yx_fsv32_opt(const layout& input_layout,
+    bool convolution_b_fs_yx_fsv16_opt(const layout& input_layout,
+                                       const layout& output_layout,
+                                       const layout& weights_layout,
+                                       std::shared_ptr<const convolution> conv,
+                                       bool weak_restrictions = false);
+    bool convolution_b_fs_zyx_fsv16_opt(const layout& input_layout,
+                                        const layout& output_layout,
+                                        const layout& weights_layout,
+                                        std::shared_ptr<const convolution> conv);
+    bool convolution_bs_fs_yx_bsv16_fsv16_opt(const layout& input_layout,
+                                              const layout& output_layout,
+                                              const layout& weights_layout,
+                                              std::shared_ptr<const convolution> conv);
+    bool convolution_fs_b_yx_fsv32_opt(const layout& input_layout,
+                                       const layout& output_layout,
                                        const layout& weights_layout,
                                        std::shared_ptr<const convolution> conv,
                                        bool weak_restrictions = false);
-    bool deconvolution_b_fs_zyx_fsv16_opt(const layout &output_layout,
+    bool deconvolution_b_fs_zyx_fsv16_opt(const layout &input_layout,
                                           const layout &weights_layout,
                                           std::shared_ptr<const deconvolution> conv);
-    bool deconvolution_b_fs_yx_fsv16_opt(const layout &output_layout,
+    bool deconvolution_b_fs_yx_fsv16_opt(const layout &input_layout,
                                          const layout &weights_layout,
                                          std::shared_ptr<const deconvolution> conv);
     bool users_for_convolution_byxf_opt(program_node const& node, uint32_t depth);
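The pattern across these signature changes is the same: each optimization predicate now receives the convolution's output layout explicitly instead of inferring the output feature count from the weights tensor, whose batch dimension means different things for grouped and non-grouped weights. A minimal sketch of the ambiguity the old approach had to special-case (plain structs as stand-ins, not the clDNN API):

// Sketch (not clDNN): why deriving OFM-per-group from weights is ambiguous.
// For non-grouped weights the batch dim is the total output feature count,
// but for grouped weights it is already the per-group count, so the old code
// needed the group_num() > 0 special case. The output layout always stores
// the total feature count, so no special case is needed.
#include <cassert>
#include <cstdint>

struct WeightsDesc {     // hypothetical stand-in for cldnn::layout
    int32_t batch;       // total OFM (non-grouped) or OFM per group (grouped)
    int32_t groups;      // 1 when the weights carry no group dimension
};

int32_t ofm_per_group_from_weights(const WeightsDesc& w, int32_t conv_groups) {
    int32_t ofm = w.batch / conv_groups;   // correct for non-grouped weights only
    if (w.groups > 1)
        ofm = w.batch;                     // grouped weights: already per-group
    return ofm;
}

int32_t ofm_per_group_from_output(int32_t output_features, int32_t conv_groups) {
    return output_features / conv_groups;  // uniform for both cases
}

int main() {
    // 64 output features in 4 groups -> 16 per group, however it is derived.
    assert(ofm_per_group_from_weights({64, 1}, 4) == 16);  // non-grouped weights
    assert(ofm_per_group_from_weights({16, 4}, 4) == 16);  // grouped weights
    assert(ofm_per_group_from_output(64, 4) == 16);        // no special case
    return 0;
}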

layout_optimizer.cpp

@@ -354,8 +354,9 @@ bool layout_optimizer::convolution_byxf_opt(const layout& input_layout,
     return false;
 }
 
-bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
-                                                     const layout &weights_layout,
+bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(const layout& input_layout,
+                                                     const layout& output_layout,
+                                                     const layout& weights_layout,
                                                      std::shared_ptr<const convolution> conv,
                                                      bool weak_restrictions) {
     // A set of rules that define when b_fs_yx_fsv16 mem format can be used for int8 case
@@ -367,15 +368,12 @@ bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
     auto ks_x = weights_layout.size.spatial[0];
     auto ks_y = weights_layout.size.spatial[1];
     size_t in_features_per_group = input_layout.size.feature[0] / conv->groups;
-    size_t out_features_per_group = weights_layout.size.batch[0] / conv->groups;
-    if (weights_layout.format.group_num() > 0) {
-        out_features_per_group = weights_layout.size.batch[0];
-    }
+    size_t out_features_per_group = output_layout.size.feature[0] / conv->groups;
 
     // Check for non-grouped or depthwise convolution
     if (input_layout.format.dimension() == 4 &&
         ((ks_x == 7 && ks_y == 7) || (ks_x == 3 && ks_y == 3) || (ks_x == 1 && ks_y == 1) || (ks_x == 5 && ks_y == 5)) &&
-        weights_layout.size.batch[0] * weights_layout.size.group[0] >= 16 &&
+        output_layout.size.feature[0] >= 16 &&
         ((conv->groups == 1 && conv->split() == 1) ||
          conv->groups == static_cast<uint32_t>(input_layout.size.feature[0]) ||
          conv->split() == static_cast<int32_t>(input_layout.size.feature[0])))
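For context on what these checks gate: in b_fs_yx_fsv16 the feature axis is split into slices of 16 and the offset inside a slice is the innermost coordinate, which is why the rules above want at least 16 features (or counts that tile cleanly against 16). A minimal addressing sketch, simplified to ignore clDNN's padding and batch blocking:

// Sketch (simplified, no padding): element addressing in a b_fs_yx_fsv16-style
// blocked layout. The position of a feature inside its 16-wide slice is the
// fastest-varying index.
#include <cstddef>
#include <iostream>

constexpr size_t FSV = 16;  // feature slice size

size_t offset_b_fs_yx_fsv16(size_t b, size_t f, size_t y, size_t x,
                            size_t F, size_t Y, size_t X) {
    const size_t f_blocks = (F + FSV - 1) / FSV;  // ceil(F / 16)
    return (((b * f_blocks + f / FSV) * Y + y) * X + x) * FSV + f % FSV;
}

int main() {
    // Feature 17 of a 32-feature 8x8 tensor lands in slice 1, slot 1.
    std::cout << offset_b_fs_yx_fsv16(0, 17, 0, 0, 32, 8, 8) << "\n";  // 1025
    return 0;
}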
@@ -394,27 +392,27 @@ bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
         return false;
     }
 
     // A set of rules that define when b_fs_yx_fsv16 mem format can be used for fp16/fp32 case
-    auto feature_block_size = 16;
-    auto correct_data_type = input_layout.data_type == data_types::f16 || input_layout.data_type == data_types::f32;
+    int32_t feature_block_size = 16;
+    int32_t correct_data_type = input_layout.data_type == data_types::f16 || input_layout.data_type == data_types::f32;
     correct_data_type &= weights_layout.data_type == input_layout.data_type;
-    auto correct_batch = (input_layout.size.batch[0] == 1) || (input_layout.size.batch[0] > 1 && input_layout.data_type == data_types::f32);
-    auto correct_spatial_dims = input_layout.size.spatial[2] == 1 && input_layout.size.spatial[3] == 1;
-    auto required_feature_num = weak_restrictions ? feature_block_size / 2 : feature_block_size;
-    auto correct_in_feature = (input_layout.size.feature[0] >= required_feature_num &&
-                               weights_layout.size.batch[0] * weights_layout.size.group[0] >= required_feature_num);
-    if (!correct_in_feature && input_layout.size.feature[0] <= 4 && weights_layout.size.batch[0] >= feature_block_size)
+    int32_t correct_batch = (input_layout.size.batch[0] == 1) || (input_layout.size.batch[0] > 1 && input_layout.data_type == data_types::f32);
+    int32_t correct_spatial_dims = input_layout.size.spatial[2] == 1 && input_layout.size.spatial[3] == 1;
+    int32_t required_feature_num = weak_restrictions ? feature_block_size / 2 : feature_block_size;
+    int32_t correct_in_feature = (input_layout.size.feature[0] >= required_feature_num &&
+                                  output_layout.size.feature[0] >= required_feature_num);
+    int32_t in_features_per_group = input_layout.size.feature[0] / conv->groups;
+    int32_t out_features_per_group = output_layout.size.feature[0] / conv->groups;
+    if (!correct_in_feature && input_layout.size.feature[0] <= 4 && out_features_per_group >= feature_block_size)
         correct_in_feature = true;
-    auto depthwise = conv->groups == static_cast<uint32_t>(input_layout.size.feature[0]);  // depthwise conv
-    auto out_features_per_group = weights_layout.size.batch[0];
-    auto in_features_per_group = weights_layout.size.feature[0];
-    auto grouped = ((feature_block_size % out_features_per_group == 0) &&
-                    (feature_block_size % in_features_per_group == 0) &&
-                    (feature_block_size / out_features_per_group > 1) &&
-                    (feature_block_size / in_features_per_group > 1) &&
-                    (out_features_per_group != 1) &&
-                    (in_features_per_group != 1)) ||
-                   ((out_features_per_group % feature_block_size == 0 || feature_block_size % out_features_per_group == 0) &&
-                    (in_features_per_group % feature_block_size == 0));
+    int32_t depthwise = conv->groups == static_cast<uint32_t>(input_layout.size.feature[0]);  // depthwise conv
+    int32_t grouped = ((feature_block_size % out_features_per_group == 0) &&
+                       (feature_block_size % in_features_per_group == 0) &&
+                       (feature_block_size / out_features_per_group > 1) &&
+                       (feature_block_size / in_features_per_group > 1) &&
+                       (out_features_per_group != 1) &&
+                       (in_features_per_group != 1)) ||
+                      ((out_features_per_group % feature_block_size == 0 || feature_block_size % out_features_per_group == 0) &&
+                       (in_features_per_group % feature_block_size == 0));
     if (correct_data_type &&
         correct_batch &&
         correct_spatial_dims &&
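The rewritten grouped predicate is easier to follow with the clDNN types stripped away. A standalone sketch of the same logic over plain per-group feature counts:

// Sketch: the `grouped` predicate from the fp16/fp32 branch above, extracted
// for standalone testing. Names mirror the diff.
#include <cstdint>
#include <iostream>

bool grouped_ok(int32_t ifm_per_group, int32_t ofm_per_group,
                int32_t feature_block_size = 16) {
    // Either both per-group counts are proper small divisors of the block...
    bool small_groups =
        feature_block_size % ofm_per_group == 0 &&
        feature_block_size % ifm_per_group == 0 &&
        feature_block_size / ofm_per_group > 1 &&
        feature_block_size / ifm_per_group > 1 &&
        ofm_per_group != 1 && ifm_per_group != 1;
    // ...or they tile evenly against the 16-wide feature block.
    bool aligned_groups =
        (ofm_per_group % feature_block_size == 0 ||
         feature_block_size % ofm_per_group == 0) &&
        ifm_per_group % feature_block_size == 0;
    return small_groups || aligned_groups;
}

int main() {
    std::cout << grouped_ok(4, 8) << "\n";    // 1: both divide 16 with quotient > 1
    std::cout << grouped_ok(32, 16) << "\n";  // 1: multiples of the block size
    std::cout << grouped_ok(3, 8) << "\n";    // 0: 3 neither divides nor tiles 16
    return 0;
}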
@@ -424,18 +422,20 @@ bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
     return false;
 }
 
-bool layout_optimizer::should_select_b_fs_yx_fsv16_layout(convolution_node const& node, layout const& output_or_weights_layout) {
+bool layout_optimizer::should_select_b_fs_yx_fsv16_layout(convolution_node const& node, layout const& weights_layout) {
     auto prim = node.get_primitive();
     auto input_layout = node.get_dependency(0).get_output_layout();
     auto const cond_denom = _total_conv > 0 ? 1.0f / static_cast<float>(_total_conv) : 1.0f;
     auto fully_support_conv_num = _optimized_conv_count.at({format::b_fs_yx_fsv16, false});
     auto partially_support_conv_num = _optimized_conv_count.at({format::b_fs_yx_fsv16, true});
-    auto current_conv_supports_layout = convolution_b_fs_yx_fsv16_opt(input_layout, output_or_weights_layout, prim);
+    auto output_layout = node.calc_output_layout();
+    auto current_conv_supports_layout = convolution_b_fs_yx_fsv16_opt(input_layout, output_layout, weights_layout, prim);
     auto is_prev_conv_node_supports_layout = node.get_dependency(0).is_type<convolution>() &&
                                              is_format_optimized(node.get_dependency(0).as<convolution>(), format::b_fs_yx_fsv16);
     auto weak_restriction_cond = (partially_support_conv_num - fully_support_conv_num) * cond_denom < 0.15f;
-    auto current_conv_partially_supports_layout = convolution_b_fs_yx_fsv16_opt(input_layout, output_or_weights_layout, prim, true);
+    auto current_conv_partially_supports_layout = convolution_b_fs_yx_fsv16_opt(input_layout, output_layout, weights_layout, prim, true);
     auto may_use_weak_restrictions = is_prev_conv_node_supports_layout || weak_restriction_cond;
 
     return ((_optimization_attributes.b_fs_yx_fsv16_network) &&
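The weak-restriction gate computed here is a simple ratio test: relax the per-convolution checks only while convolutions that support the layout partially but not fully stay under 15% of all convolutions in the network. A standalone sketch with concrete numbers:

// Sketch: the weak_restriction_cond test from should_select_b_fs_yx_fsv16_layout.
#include <iostream>

bool may_relax(int total_conv, int fully_supported, int partially_supported) {
    float cond_denom = total_conv > 0 ? 1.0f / static_cast<float>(total_conv) : 1.0f;
    return (partially_supported - fully_supported) * cond_denom < 0.15f;
}

int main() {
    std::cout << may_relax(100, 70, 80) << "\n";  // 1: 10% borderline convs
    std::cout << may_relax(100, 40, 60) << "\n";  // 0: 20% borderline, too many
    return 0;
}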
@@ -443,15 +443,13 @@ bool layout_optimizer::should_select_b_fs_yx_fsv16_layout(convolution_node const
            input_layout.format == format::b_fs_yx_fsv16;
 }
 
-bool layout_optimizer::convolution_b_fs_zyx_fsv16_opt(layout const &input_layout,
-                                                      const layout &weights_layout,
+bool layout_optimizer::convolution_b_fs_zyx_fsv16_opt(const layout& input_layout,
+                                                      const layout& output_layout,
+                                                      const layout& weights_layout,
                                                       std::shared_ptr<const convolution> conv) {
     // A set of rules that define when b_fs_zyx_fsv16 mem format can be used
     size_t in_features_per_group = input_layout.size.feature[0] / conv->groups;
-    size_t out_features_per_group = weights_layout.size.batch[0] / conv->groups;
-    if (weights_layout.format.group_num() > 0) {
-        out_features_per_group = weights_layout.size.batch[0];
-    }
+    size_t out_features_per_group = output_layout.size.feature[0] / conv->groups;
 
     // Check for fsv16 imad kernel
     if ((input_layout.format.dimension() == 5) &&
@@ -465,12 +463,13 @@ bool layout_optimizer::convolution_b_fs_zyx_fsv16_opt(layout const &input_layout
     bool data_type_ver = input_layout.data_type == data_types::f16 || input_layout.data_type == data_types::f32;
     bool w_layout = weights_layout.data_type == input_layout.data_type;
     bool single_dilation = conv->dilation == tensor(1);
-    bool groups_ver = conv->groups == 1 || weights_layout.size.batch[0] % 16 == 0
-                      || (conv->groups > 1 && weights_layout.size.batch[0] == 8);
+    bool groups_ver = conv->groups == 1 || out_features_per_group % 16 == 0
+                      || (conv->groups > 1 && out_features_per_group == 8);
     return format_ver && data_type_ver && w_layout && single_dilation && groups_ver;
 }
 
-bool layout_optimizer::convolution_bs_fs_yx_bsv16_fsv16_opt(const layout &input_layout,
+bool layout_optimizer::convolution_bs_fs_yx_bsv16_fsv16_opt(const layout& input_layout,
+                                                            const layout& output_layout,
                                                             const layout& weights_layout,
                                                             std::shared_ptr<const convolution> conv) {
     // A set of rules that define when bs_fs_yx_bsv16_fsv16 mem format can be used
@@ -488,21 +487,22 @@ bool layout_optimizer::convolution_bs_fs_yx_bsv16_fsv16_opt(const layout &input_
     auto ks_x = weights_layout.size.spatial[0];
     auto ks_y = weights_layout.size.spatial[1];
     int8_sup &= (input_layout.size.spatial[2] == 1 && ((ks_x == 1 && ks_y == 1) || (ks_x == 3 && ks_y == 3) || (ks_x == 7 && ks_y == 7)) &&
-                 weights_layout.size.batch[0] % 32 == 0 && conv->split() == 1 && conv->dilation == tensor{1});
+                 output_layout.size.feature[0] % 32 == 0 && conv->split() == 1 && conv->dilation == tensor{1});
 
     return (int8_sup || fp16_ver || fp32_ver) && correct_feature && correct_batch && single_group;
 }
 
-bool layout_optimizer::convolution_fs_b_yx_fsv32_opt(layout const& input_layout,
-                                                     layout const& weights_layout,
+bool layout_optimizer::convolution_fs_b_yx_fsv32_opt(const layout& input_layout,
+                                                     const layout& output_layout,
+                                                     const layout& weights_layout,
                                                      std::shared_ptr<const convolution> conv,
                                                      bool weak_restrictions) {
-    auto ofm = weights_layout.size.batch[0] * weights_layout.size.group[0];
+    auto ofm = output_layout.size.feature[0];
     // A set of rules that define when fs_b_yx_fsv32 mem format can be used
     bool correct_batch = input_layout.size.batch[0] > 1;
     bool correct_in_feature = input_layout.size.feature[0] >= 16;
     bool correct_out_feature = weak_restrictions ? ofm >= 16 : ofm > 16;
-    bool dw_conv = static_cast<int>(conv->groups) == weights_layout.size.group[0];
+    bool dw_conv = static_cast<int>(conv->groups) == input_layout.size.feature[0];
     if (!correct_in_feature && input_layout.size.feature[0] == 3 && conv->groups == 1) {  // bfyx with 3 feature -> fs_b_yx_fsv32 case
         correct_in_feature = true;
     }
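Two of the changes in this function are behavioral rather than plumbing: `ofm` is now read from the output layout instead of weights batch × group, and the depthwise test no longer compares the group count against the weights' group dimension, which holds for any grouped convolution, but against the input feature count, which is what depthwise actually means. A minimal sketch of that distinction:

// Sketch: old vs. new depthwise detection. With grouped weights,
// groups == weights-group-dim is true for every grouped convolution;
// depthwise specifically means one group per input channel.
#include <iostream>

bool dw_old(int groups, int weights_group_dim) { return groups == weights_group_dim; }
bool dw_new(int groups, int input_features)    { return groups == input_features; }

int main() {
    // 64 input features in 4 groups: grouped, but not depthwise.
    std::cout << dw_old(4, 4) << " " << dw_new(4, 64) << "\n";    // 1 0
    // 64 input features in 64 groups: genuinely depthwise.
    std::cout << dw_old(64, 64) << " " << dw_new(64, 64) << "\n"; // 1 1
    return 0;
}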
@@ -672,7 +672,7 @@ format layout_optimizer::imad_case(convolution_node const& node) const {
 layout layout_optimizer::get_expected_layout(layout const& current_layout,
                                              convolution_node const& node,
-                                             layout const& output_or_weights_layout) {
+                                             layout const& weights_layout) {
     auto prim = node.get_primitive();
     auto expected_tensor = current_layout.size;
     auto expected_data_type = current_layout.data_type;
@@ -681,23 +681,24 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
     const float cond_denom = _total_conv > 0 ? 1.0f / static_cast<float>(_total_conv) : 1.0f;
+    auto output_layout = node.calc_output_layout();
 
     if ((input_layout.data_type == data_types::u8 || input_layout.data_type == data_types::i8)) {
         if ((_optimization_attributes.bs_fs_yx_bsv16_fsv16_network && expected_tensor.batch[0] % 16 == 0 &&
-            convolution_bs_fs_yx_bsv16_fsv16_opt(input_layout, output_or_weights_layout, prim))) {
+            convolution_bs_fs_yx_bsv16_fsv16_opt(input_layout, output_layout, weights_layout, prim))) {
             expected_format = cldnn::format::bs_fs_yx_bsv16_fsv16;
         } else if ((_optimization_attributes.b_fs_yx_fsv16_network &&
-                    convolution_b_fs_yx_fsv16_opt(input_layout, output_or_weights_layout, prim))) {
+                    convolution_b_fs_yx_fsv16_opt(input_layout, output_layout, weights_layout, prim))) {
             expected_format = cldnn::format::b_fs_yx_fsv16;
         } else if ((_optimization_attributes.b_fs_zyx_fsv16_network &&
-                    convolution_b_fs_zyx_fsv16_opt(input_layout, output_or_weights_layout, prim))) {
+                    convolution_b_fs_zyx_fsv16_opt(input_layout, output_layout, weights_layout, prim))) {
             expected_format = cldnn::format::b_fs_zyx_fsv16;
         } else {
             expected_format = imad_case(node);
         }
         expected_tensor = current_layout.size;
     } else if (_optimization_attributes.b_fs_zyx_fsv16_network &&
-               convolution_b_fs_zyx_fsv16_opt(input_layout,
-                                              output_or_weights_layout, prim)) {
+               convolution_b_fs_zyx_fsv16_opt(input_layout, output_layout, weights_layout, prim)) {
         expected_tensor = current_layout.size;
         if ((current_layout.data_type == data_types::f32 && expected_tensor.batch[0] % 16 == 0) ||
             (current_layout.data_type == data_types::f16 && expected_tensor.batch[0] % 32 == 0))
@@ -709,27 +710,29 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
         expected_tensor = current_layout.size;
         expected_format = cldnn::format::bfzyx;
     } else if (_optimization_attributes.bs_fs_yx_bsv16_fsv16_network &&
-               convolution_bs_fs_yx_bsv16_fsv16_opt(node.input().get_output_layout(), output_or_weights_layout, prim)) {
+               convolution_bs_fs_yx_bsv16_fsv16_opt(node.input().get_output_layout(), output_layout, weights_layout, prim)) {
         expected_tensor = current_layout.size;
         expected_format = cldnn::format::bs_fs_yx_bsv16_fsv16;
     } else if (_optimization_attributes.fs_b_yx_fsv32_network && !node.get_transposed() &&
-               ((convolution_fs_b_yx_fsv32_opt(node.get_dependency(0).get_output_layout(),
-                node.get_dependency(1).get_output_layout(), prim) ||
+               ((convolution_fs_b_yx_fsv32_opt(input_layout,
+                                               output_layout,
+                                               weights_layout, prim) ||
                (((node.get_dependency(0).is_type<convolution>() && is_format_optimized(node.get_dependency(0).as<convolution>(), format::fs_b_yx_fsv32))
                || (_optimized_conv_count.at({format::fs_b_yx_fsv32, false}) * cond_denom > 0.8f)) &&
-                convolution_fs_b_yx_fsv32_opt(node.get_dependency(0).get_output_layout(),
-                                              node.get_dependency(1).get_output_layout(), prim, true))))) {
+                convolution_fs_b_yx_fsv32_opt(input_layout,
+                                              output_layout,
+                                              weights_layout, prim, true))))) {
         // Chose fs_b_yx_fsv32 layout in two cases: 1-st: the current conv primitive totally supports fs_b_yx_fsv32 layout
         //                                          2-nd: the previous conv primitive supports fs_b_yx_fsv32 layout and
         //                                                current conv primitives supports this one with weak restrictions -
         //                                                that should be cheaper than reordering data to another layout
         expected_tensor = current_layout.size;
         expected_format = format::fs_b_yx_fsv32;
-    } else if (should_select_b_fs_yx_fsv16_layout(node, output_or_weights_layout)) {
+    } else if (should_select_b_fs_yx_fsv16_layout(node, weights_layout)) {
         expected_tensor = current_layout.size;
         expected_format = cldnn::format::b_fs_yx_fsv16;
     } else if (current_layout.data_type == data_types::f16 &&
-               layout_optimizer::convolution_byxf_opt(node.input().get_output_layout(), current_layout, output_or_weights_layout, node) &&
+               layout_optimizer::convolution_byxf_opt(input_layout, current_layout, weights_layout, node) &&
               (users_for_convolution_byxf_opt(node, 2) || deps_for_convolution_byxf_opt(node, 2)) &&
               // todo: remove this condition when yxfb optimizations will be disabled
               current_layout.format != cldnn::format::yxfb && current_layout.size.batch[0] == 1) {
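The comment block spells out the selection policy; reduced to a standalone sketch, the decision looks like this (the names stand in for the calls in the diff, not the clDNN API):

// Sketch: the two routes into fs_b_yx_fsv32 described in the comment above.
// supports_strict/supports_weak stand in for convolution_fs_b_yx_fsv32_opt
// with weak_restrictions = false/true; prev_opt for the predecessor check;
// opt_ratio for _optimized_conv_count * cond_denom.
#include <iostream>

bool choose_fs_b_yx_fsv32(bool supports_strict, bool supports_weak,
                          bool prev_opt, float opt_ratio) {
    if (supports_strict)  // case 1: the conv fully supports the layout
        return true;
    // case 2: a neighbouring conv already runs in fs_b_yx_fsv32 (or >80% of
    // convs do) and this conv qualifies under weak restrictions; staying in
    // the layout is cheaper than inserting reorders.
    return (prev_opt || opt_ratio > 0.8f) && supports_weak;
}

int main() {
    std::cout << choose_fs_b_yx_fsv32(false, true, true, 0.1f) << "\n";   // 1
    std::cout << choose_fs_b_yx_fsv32(false, true, false, 0.5f) << "\n";  // 0
    return 0;
}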
@@ -739,7 +742,7 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
                current_layout.format == format::os_is_yx_osv16_isv4) {
         // imad case
         // nothing to do, just go out from here.
-    } else if (layout_optimizer::convolution_bfyx_opt(current_layout, output_or_weights_layout, prim) ||
+    } else if (layout_optimizer::convolution_bfyx_opt(current_layout, weights_layout, prim) ||
                (_output_size_handling_enabled && prim->with_output_size) || node.get_transposed()) {
         {
             expected_tensor = current_layout.size;
@@ -907,20 +910,21 @@ void layout_optimizer::set_optimization_attribute(optimization_attributes_type a
 bool layout_optimizer::is_format_optimized(const convolution_node& node, const format& format, bool use_weak_restrictions) {
     auto input_layout = node.input().get_output_layout();
     auto weights_layout = node.weights().get_output_layout();
+    auto output_layout = node.calc_output_layout();
     auto prim = node.get_primitive();
 
     switch (format) {
         case format::b_fs_yx_fsv16:
-            return convolution_b_fs_yx_fsv16_opt(input_layout, weights_layout, prim, use_weak_restrictions) &&
+            return convolution_b_fs_yx_fsv16_opt(input_layout, output_layout, weights_layout, prim, use_weak_restrictions) &&
                    // Work-around for inability to use b_fs_yx_fsv16 and winograd together
                    !should_use_winograd_2x3_s1(prim, input_layout, weights_layout, _output_size_handling_enabled);
         case format::b_fs_zyx_fsv16:
         case format::bs_fs_zyx_bsv16_fsv16:
-            return convolution_b_fs_zyx_fsv16_opt(input_layout, weights_layout, prim);
+            return convolution_b_fs_zyx_fsv16_opt(input_layout, output_layout, weights_layout, prim);
         case format::fs_b_yx_fsv32:
-            return convolution_fs_b_yx_fsv32_opt(input_layout, weights_layout, prim);
+            return convolution_fs_b_yx_fsv32_opt(input_layout, output_layout, weights_layout, prim);
         case format::bs_fs_yx_bsv16_fsv16:
-            return convolution_bs_fs_yx_bsv16_fsv16_opt(input_layout, weights_layout, prim);
+            return convolution_bs_fs_yx_bsv16_fsv16_opt(input_layout, output_layout, weights_layout, prim);
         default:
             throw std::invalid_argument(
                 "[Layout optimizer] Other formats in is_format_optimized(...) method are not implemented!");