[GPU] DG2 layout optimizer improvements (#8074)
* [GPU] DG2 layout optimizer improvements
* Fix FP16/FP32 regressions and optimize first quantize/reorder
parent 8b0bec4e04
commit e6f0b8fe88
@@ -187,12 +187,20 @@ void prepare_primitive_fusing::fuse_reorders(program &p) {
 
 void prepare_primitive_fusing::fuse_activations(program &p) {
     bool is_debug = p.get_options().get<build_option_type::debug>()->enabled();
     std::map<primitive_id, std::vector<primitive_id>> fusing_history;
+    bool use_onednn_impls = false;
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    auto& engine = p.get_engine();
+    if (engine.get_device_info().supports_immad && engine.configuration().queue_type == queue_types::in_order)
+        use_onednn_impls = true;
+#endif
+
     auto itr = p.get_processing_order().begin();
     while (itr != p.get_processing_order().end()) {
         auto node_itr = itr++;
         auto& node = (*node_itr);
 
-        program_helpers::do_for_types<activation>(*node, [&p, &is_debug, &fusing_history](activation_node& node) {
+        program_helpers::do_for_types<activation>(*node, [&p, &is_debug, &fusing_history, &use_onednn_impls](activation_node& node) {
             auto& input = node.input();
             auto id = node.id();
             // Restrictions:
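Note: the gate above mirrors how the GPU plugin decides whether oneDNN kernels are in play: the device must report supports_immad (systolic dot-product hardware, as on DG2) and the engine must run an in-order queue. A minimal standalone model of that check, using simplified stand-in types rather than the real cldnn::engine API:

#include <iostream>

// Stand-ins for the cldnn types; only the fields the gate reads are modeled.
enum class queue_types { in_order, out_of_order };
struct device_info_stub { bool supports_immad; };

// Mirrors the #ifdef ENABLE_ONEDNN_FOR_GPU block: oneDNN impls are considered
// only on immad-capable devices driven through an in-order queue.
bool should_use_onednn(const device_info_stub& info, queue_types queue) {
    return info.supports_immad && queue == queue_types::in_order;
}

int main() {
    std::cout << should_use_onednn({true}, queue_types::in_order) << '\n';     // 1
    std::cout << should_use_onednn({true}, queue_types::out_of_order) << '\n'; // 0
    std::cout << should_use_onednn({false}, queue_types::in_order) << '\n';    // 0
}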
@@ -239,6 +247,9 @@ void prepare_primitive_fusing::fuse_activations(program &p) {
                 return;
             }
 
+            if (input.is_type<reshape>() && use_onednn_impls)
+                return;
+
             if (input.get_fused_primitives().empty()) {
                 input.add_fused_activation(node.get_primitive()->activation_function, node.get_primitive()->additional_params);
                 for (size_t i = 0; i < node.get_fused_activations_funcs().size(); i++) {
@@ -56,7 +56,12 @@ void remove_redundant_reorders::run(program& p) {
 
         // Avoid different data types between input and output
         auto same_data_type = input.get_output_layout().data_type == output_layout.data_type;
-        if (!same_data_type)
+        auto i8_u8_input = input.get_output_layout().data_type == data_types::i8 ||
+                           input.get_output_layout().data_type == data_types::u8;
+        auto quantize_user = node.get_users().front()->is_type<quantize>() &&
+                             node.get_users().size() == 1;
+
+        if (!same_data_type && !(i8_u8_input && quantize_user))
             continue;
 
         // Avoid optimization of nv12 reorder
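Note: the pass previously kept any reorder whose input and output data types differ. The change carves out one exception: an i8/u8 input whose only user is a quantize node, since the quantize re-quantizes its input anyway and can absorb the conversion. A simplified sketch of the relaxed predicate, with stand-in types in place of the cldnn node API:

// Stand-in types; data_types and the user query are simplified versions of
// the cldnn accessors used in the hunk above.
enum class data_types { i8, u8, f16, f32 };

struct reorder_view {
    data_types input_dt;
    data_types output_dt;
    bool single_quantize_user;  // node.get_users().size() == 1 && front is quantize
};

// A dtype-changing reorder is now removable when its i8/u8 input feeds a
// lone quantize; otherwise it must be kept.
bool reorder_is_removable(const reorder_view& r) {
    bool same_data_type = r.input_dt == r.output_dt;
    bool i8_u8_input = r.input_dt == data_types::i8 || r.input_dt == data_types::u8;
    return same_data_type || (i8_u8_input && r.single_quantize_user);
}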
@@ -323,7 +328,7 @@ void remove_redundant_reorders::run(program& p) {
         auto& dep = node_ptr->get_dependency(0);
         if (!usr->is_type<quantize>() ||
             (dep.get_output_layout().format != format::b_fs_yx_fsv16 &&
-             dep.get_output_layout().format != format::fs_b_yx_fsv32 &&
+             (lo.get_optimization_attributes().use_onednn_impls || dep.get_output_layout().format != format::fs_b_yx_fsv32) &&
              dep.get_output_layout().format != format::bfyx))
             continue;
 
@@ -348,7 +353,7 @@ void remove_redundant_reorders::run(program& p) {
         auto& dep = node->get_dependency(0);
 
         if (!(usr->is_type<convolution>()) ||
-            usr->get_output_layout().data_type != dep.get_output_layout().data_type ||
+            node->get_output_layout().data_type != dep.get_output_layout().data_type ||
             dep.get_output_layout().format != format::bfyx)
             return false;
         if (usr->as<convolution>().get_preferred_impl_type() != impl_types::onednn &&
@@ -178,6 +178,8 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
     auto prev_output_layout = prev.get_output_layout();
     auto next_output_layout = next.get_output_layout();
     auto prev_dt = prev.get_output_layout().data_type;
     auto next_dt = next.get_output_layout().data_type;
+    auto use_onednn_impls = _optimization_attributes.use_onednn_impls;
+
     auto is_input_idx = [&](size_t idx) -> bool {
         if (&next.get_dependency(idx) == &prev)
@@ -191,7 +193,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
         return true;
 
     if (next.is_type<pooling>() &&
-        ((prev_simple && next_simple) ||
+        (((prev_simple && next_simple) && (prev_dt == next_dt)) ||
         ((fmt_prev == format::b_fs_yx_fsv4 && fmt_next == format::bfyx) && (prev_dt == data_types::u8 || prev_dt == data_types::i8))))
         return true;
 
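Note: the pooling branch previously fused a reorder whenever both formats were "simple", even when the reorder also converted the data type; folding an FP32-to-FP16 (or FP16-to-FP32) reorder into pooling silently changed precision, which is one source of the FP16/FP32 regressions named in the commit message. The before/after rules as plain predicates (a simplified sketch, with hypothetical helper names):

enum class data_types { i8, u8, f16, f32 };

// Old rule: format simplicity alone decided fusion.
bool can_fuse_into_pooling_old(bool prev_simple, bool next_simple) {
    return prev_simple && next_simple;
}

// New rule: the reorder must also be dtype-preserving to be fused.
bool can_fuse_into_pooling_new(bool prev_simple, bool next_simple,
                               data_types prev_dt, data_types next_dt) {
    return prev_simple && next_simple && prev_dt == next_dt;
}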
@@ -223,9 +225,12 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
 
+    // Additional check: fmt_prev == fmt_next is added only when onednn is enabled.
     if (next.is_type<convolution>() &&
-        fmt_prev == format::bfyx && (!get_optimization_attributes().use_onednn_impls || fmt_prev == fmt_next) &&
+        (!use_onednn_impls || fmt_prev == fmt_next) &&
+        (fmt_prev == format::bfyx || fmt_prev == format::bs_fs_yx_bsv4_fsv2) &&
         ((fmt_next == format::fs_b_yx_fsv32 && next.as<convolution>().get_primitive()->groups == 1) ||
         (fmt_next == format::b_fs_yx_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
         (fmt_next == format::bs_fs_yx_bsv32_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
+        (fmt_next == format::bs_fs_yx_bsv32_fsv16 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
         (fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] % 16 == 0 && prev_output_layout.size.feature[0] == 3) ||
         (fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] == 3 &&
         (next_output_layout.data_type != data_types::i8 && next_output_layout.data_type != data_types::u8))))
@@ -239,8 +244,10 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
         return true;
 
     if (next.is_type<convolution>() &&
-        fmt_prev == format::b_fs_yx_fsv4 &&
+        (fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::bs_fs_yx_bsv4_fsv4) &&
         ((fmt_next == format::b_fs_yx_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
+        (fmt_next == format::bs_fs_yx_bsv32_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
+        (fmt_next == format::bs_fs_yx_bsv4_fsv4 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
         (fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 &&
         (prev_output_layout.size.feature[0] == 3 || (prev_output_layout.size.feature[0] == 4 && (prev_dt == data_types::u8 || prev_dt == data_types::i8))))))
         return true;
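Note: the recurring feature[0] == 3 || feature[0] == 4 test in these convolution branches likely identifies a network's first convolution, whose input is raw 3-channel (RGB) or 4-channel (RGBA) data; only there is fusing the input reorder into a blocked DG2 layout worthwhile. A hypothetical helper naming that intent:

// Hypothetical helper capturing the repeated feature-count test above:
// 3- or 4-channel inputs mark the first convolution of a network.
bool is_first_conv_input(int input_features) {
    return input_features == 3 || input_features == 4;
}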
@@ -263,12 +270,51 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
         next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] == 3))
         return true;
 
+    if (use_onednn_impls) {
+        if (next.is_type<eltwise>() && (fmt_prev == format::bfyx) && (fmt_next == format::bs_fs_yx_bsv4_fsv2) &&
+            prev.is_input() && (prev_dt == data_types::u8 || prev_dt == data_types::i8))
+            return true;
+
+        if (next.is_type<convolution>() && (prev.is_type<eltwise>() || prev.is_type<quantize>()) &&
+            (fmt_prev == format::bfyx || fmt_prev == format::bs_fs_yx_bsv4_fsv2) &&
+            ((fmt_next == format::bs_fs_yx_bsv32_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
+             (fmt_next == format::bs_fs_yx_bsv32_fsv16 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4))))
+            return true;
+
+        if (next.is_type<quantize>())
+            return true;
+
+        if (next.is_type<permute>()) {
+            auto is_rotating_except_batch = [](const std::vector<uint16_t>& order) {
+                // Target transform: Rotate feature dim to back to be taken as inner-most axis
+                // ex) 0(b), 4(f), 1(z), 2(y), 3(x)
+                // ex) 0(b), 3(f), 1(y), 2(x)
+                if ((int32_t) order[1] != order.size() - 1) return false;
+                if ((int32_t) order[0] != 0) return false;
+                for (int32_t i = 2; i < (int32_t) order.size(); ++i) {
+                    if ((int32_t)order[i] != (i - 1)) return false;
+                }
+                return true;
+            };
+
+            auto& permute_order = next.as<permute>().get_primitive()->permute_order;
+            if ((fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::b_fs_yx_fsv32 || fmt_prev == format::b_fs_zyx_fsv32 ||
+                fmt_prev == format::b_fs_yx_fsv16 || fmt_prev == format::b_fs_zyx_fsv16 || fmt_prev == format::bs_fs_yx_bsv16_fsv16)
+                && permute_order[1] == 2
+                && (!is_rotating_except_batch(permute_order))) {
+                return false;
+            }
+            return true;
+        }
+    }
 
     return false;
 }
 
 bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node& next, format fmt_prev, format fmt_next) {
     auto dt_prev = prev.get_output_layout().data_type;
     auto dt_next = next.get_output_layout().data_type;
+    auto use_onednn_impls = _optimization_attributes.use_onednn_impls;
+
     if (prev.is_type<reorder>())
         return true;
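Note: is_rotating_except_batch accepts only permutations that keep batch first and rotate the feature axis to the inner-most position; under oneDNN such permutes can consume blocked layouts directly, while any other permutation of a blocked input blocks the reorder fusion. A standalone copy of the predicate for experimentation, with worked example orders (same logic as the lambda above, outside the cldnn types):

#include <cstdint>
#include <iostream>
#include <vector>

bool is_rotating_except_batch(const std::vector<uint16_t>& order) {
    // Accept only: batch stays at 0, feature moves to the last position,
    // and the spatial axes shift down by one, e.g. {0, 3, 1, 2} for bfyx.
    if ((int32_t) order[1] != (int32_t) order.size() - 1) return false;
    if (order[0] != 0) return false;
    for (int32_t i = 2; i < (int32_t) order.size(); ++i) {
        if ((int32_t) order[i] != i - 1) return false;
    }
    return true;
}

int main() {
    std::cout << is_rotating_except_batch({0, 3, 1, 2}) << '\n';    // 1: 0(b), 3(f), 1(y), 2(x)
    std::cout << is_rotating_except_batch({0, 4, 1, 2, 3}) << '\n'; // 1: 5-D variant
    std::cout << is_rotating_except_batch({0, 2, 3, 1}) << '\n';    // 0: not such a rotation
}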
@@ -283,7 +329,7 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
         return true;
 
     if (prev.is_type<quantize>() &&
-        (fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::b_fs_yx_fsv32 || fmt_next == format::b_fs_zyx_fsv32 ||
+        (fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::b_fs_zyx_fsv32 || (fmt_next == format::b_fs_yx_fsv32 && !use_onednn_impls) ||
         fmt_next == format::b_fs_yx_fsv16 || fmt_next == format::b_fs_zyx_fsv16 || fmt_next == format::bs_fs_yx_bsv16_fsv16))
         return true;
 
@@ -310,6 +356,11 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
         return true;
     }
 
+    if (use_onednn_impls) {
+        if (prev.is_type<convolution>() && fmt_next == format::bs_fs_yx_bsv32_fsv16 && fmt_prev == format::bs_fs_yx_bsv4_fsv2)
+            return true;
+    }
+
     return false;
 }
 
@@ -1270,17 +1321,60 @@ format layout_optimizer::get_preferred_format(program_node& node) {
                                       node.as<detection_output>(),
                                       layout{ data_types::f32, format::bfyx, tensor{} }).format;
     } else if (node.is_type<quantize>()) {
+        bool use_onednn_impls = _optimization_attributes.use_onednn_impls;
         auto layout = node.get_output_layout();
         if (layout.format.spatial_num() == 2 &&
             (layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&
             layout.size.batch[0] % 16 == 0) {
-            if (layout.size.feature[0] > 8) {
+            if (use_onednn_impls && layout.size.batch[0] % 32 == 0) {
+                if (node.get_users().size() == 1 && node.get_users().front()->is_type<convolution>()) {
+                    auto& conv = node.get_users().front()->as<convolution>();
+                    auto ws = conv.get_dependency(1).get_output_layout().size;
+                    if (data_type_traits::is_floating_point(conv.get_output_layout().data_type) || ws.spatial[0] != 7 || conv.get_primitive()->groups > 1)
+                        expected = format::bfyx;
+                    else
+                        expected = format::bs_fs_yx_bsv4_fsv4;
+
+                    auto conv_output_layout = conv.get_output_layout();
+                    auto weights_layout = conv.weights(0).get_output_layout();
+                    format expected_conv_fmt = get_expected_layout(conv_output_layout, conv, weights_layout).format;
+                    if (expected == format::bfyx && expected_conv_fmt == format::bs_fs_yx_bsv32_fsv32 &&
+                        layout.size.feature[0] % 32 == 0) {
+                        expected = expected_conv_fmt;
+                    }
+                } else {
+                    expected = format::bfyx;
+                }
+            } else if (layout.size.feature[0] > 8) {
                 expected = format::b_fs_yx_fsv16;
             } else {
-                expected = format::b_fs_yx_fsv4;
+                if (use_onednn_impls && layout.size.batch[0] % 16 == 0 && node.get_users().front()->is_type<convolution>()) {
+                    auto& conv = node.get_users().front()->as<convolution>();
+                    auto ws = conv.get_dependency(1).get_output_layout().size;
+                    if (data_type_traits::is_floating_point(conv.get_output_layout().data_type) || ws.spatial[0] != 7 || conv.get_primitive()->groups > 1)
+                        expected = format::bfyx;
+                    else
+                        expected = format::bs_fs_yx_bsv4_fsv4;
+                } else {
+                    expected = format::b_fs_yx_fsv4;
+                }
             }
         } else if (layout.format.spatial_num() == 3 && (layout.data_type == data_types::i8 || layout.data_type == data_types::u8)) {
             expected = format::b_fs_zyx_fsv16;
+        } else if (use_onednn_impls) {
+            if (node.get_users().size() == 1 && node.get_users().front()->is_type<convolution>() &&
+                (layout.data_type == data_types::i8 || layout.data_type == data_types::u8)) {
+                auto& conv = node.get_users().front()->as<convolution>();
+                auto conv_output_layout = conv.get_output_layout();
+                auto weights_layout = conv.weights(0).get_output_layout();
+                format expected_conv_fmt = get_expected_layout(conv_output_layout, conv, weights_layout).format;
+                if (layout.format.spatial_num() == 2 && layout.size.feature[0] % 32 == 0 && expected_conv_fmt == format::b_fs_yx_fsv32)
+                    expected = expected_conv_fmt;
+                else
+                    expected = format::bfyx;
+            } else {
+                expected = format::bfyx;
+            }
         }
     } else if (node.is_type<reorder>() || node.is_type<input_layout>()) {
         expected = node.get_output_layout().format;
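Note: this is the "first quantize/reorder" optimization from the commit message. For a 2-D int8 quantize with a batch divisible by 16, the oneDNN path checks whether the sole consumer is a non-grouped int8 convolution whose weights are 7 wide (ws.spatial[0] == 7, which appears to single out typical 7x7 first-layer convolutions) and, if so, emits bs_fs_yx_bsv4_fsv4 instead of plain bfyx. A simplified decision model of that branch; the types are stand-ins and the expected_conv_fmt override paths are omitted, and the caller is assumed to have already verified spatial_num() == 2, i8/u8 output, and batch % 16 == 0:

enum class fmt { bfyx, b_fs_yx_fsv4, b_fs_yx_fsv16, bs_fs_yx_bsv4_fsv4 };

struct quantize_view {
    bool use_onednn_impls;      // _optimization_attributes.use_onednn_impls
    int batch;                  // layout.size.batch[0]
    int feature;                // layout.size.feature[0]
    bool int8_first_conv_user;  // sole user: non-grouped int8 conv with 7-wide weights
};

fmt preferred_quantize_format_2d(const quantize_view& q) {
    if (q.use_onednn_impls && q.batch % 32 == 0)
        return q.int8_first_conv_user ? fmt::bs_fs_yx_bsv4_fsv4 : fmt::bfyx;
    if (q.feature > 8)
        return fmt::b_fs_yx_fsv16;  // feature-blocked layout for the cl kernels
    return fmt::b_fs_yx_fsv4;       // default; the onednn sub-branch may still pick bsv4_fsv4
}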