[GPU] Rollback to cldnn from onednn and some fixes and improvements (#8761)

commit 607814828d (parent 9e3b9b8fbc)
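In short:
- reorder_inputs now counts how many nodes would take a oneDNN implementation; when that share drops below a 10% threshold it clears use_onednn_impls and recomputes preferred formats with clDNN only
- the compile-time CLDNN_REORDER_INPUTS_VERBOSE logging macros are replaced by the runtime GPU_DEBUG verbose mechanism
- concatenation, convolution and deconvolution register additional blocked layouts (bs_fs_yx_bsv16_fsv16, bs_fs_yx_bsv32_fsv16, bs_fs_yx_bsv32_fsv32, b_fs_yx_fsv32); the fsv32 entries for MVN stay commented out until the kernels are optimized
- the oneDNN-to-OCL fallback for fused eltwise now triggers only when the eltwise would actually become a oneDNN sum post-op
- the oneDNN-specific pooling preferred-format special case is removed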
@@ -32,6 +32,9 @@ ParamsKey ConcatenationKernelRef::GetSupportedKey() const {
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
+    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
+    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv32_fsv16);
+    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv32_fsv32);
     k.EnableOutputLayout(DataLayout::bf);
     k.EnableOutputLayout(DataLayout::fb);
     k.EnableOutputLayout(DataLayout::bfyx);
@@ -41,6 +44,9 @@ ParamsKey ConcatenationKernelRef::GetSupportedKey() const {
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
+    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
+    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv32_fsv16);
+    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv32_fsv32);
     k.EnableTensorOffset();
     k.EnableTensorPitches();
     k.EnableBatching();
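For readers outside clDNN: b_fs_yx_fsv16 is a blocked layout that stores the feature axis in slices of 16 as the innermost dimension; the double-blocked bs_fs_yx_bsv16_fsv16 additionally blocks batch by 16. A minimal sketch of the offset arithmetic for the double-blocked case, assuming the conventional batch-outer/feature-inner block order and ignoring the padding and pitch handling behind EnableTensorOffset/EnableTensorPitches (offset_bsv16_fsv16 is a hypothetical helper, not a clDNN function):

#include <cstddef>

// Linear offset of element (b, f, y, x) in a bs_fs_yx_bsv16_fsv16 buffer
// with logical extents F, Y, X; feature is rounded up to whole 16-slices.
std::size_t offset_bsv16_fsv16(std::size_t b, std::size_t f,
                               std::size_t y, std::size_t x,
                               std::size_t F, std::size_t Y, std::size_t X) {
    const std::size_t BS = 16, FS = 16;              // bsv16 / fsv16 block sizes
    const std::size_t f_blocks = (F + FS - 1) / FS;  // number of feature slices
    // Outer order: [b/16][f/16][y][x]; inner order: [b%16][f%16].
    return ((((b / BS) * f_blocks + f / FS) * Y + y) * X + x) * BS * FS
           + (b % BS) * FS + (f % FS);
}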
@@ -8,9 +8,11 @@
 #include "program_node.h"
 #include "layout_optimizer.h"
 #include "cldnn/graph/program.hpp"
+#include "cldnn/runtime/debug_configuration.hpp"
 #include "program_helpers.h"
 #include "binary_convolution_inst.h"
 #include "mvn_inst.h"
+#include "to_string_utils.h"

 #include <vector>
 #include <memory>
@@ -18,29 +20,6 @@
 #include <map>
 #include <set>

-#define CLDNN_REORDER_INPUTS_VERBOSE 0
-
-// Prints overall statistics of performed selection, such as number of reorders required.
-#define CLDNN_REORDER_INPUTS_VERBOSE_STATISTICS (CLDNN_REORDER_INPUTS_VERBOSE > 0)
-// Prints special cases and work-arounds matched.
-#define CLDNN_REORDER_INPUTS_VERBOSE_PATTERN_MATCH (CLDNN_REORDER_INPUTS_VERBOSE > 1)
-// Prints full list of preferred formats for each node.
-#define CLDNN_REORDER_INPUTS_VERBOSE_PREFERRED (CLDNN_REORDER_INPUTS_VERBOSE > 2)
-// Prints full list of selected formats for each node.
-#define CLDNN_REORDER_INPUTS_VERBOSE_FORMATS (CLDNN_REORDER_INPUTS_VERBOSE > 2)
-
-#if CLDNN_REORDER_INPUTS_VERBOSE
-#include "to_string_utils.h"
-#include <iostream>
-#define CLDNN_REORDER_INPUTS_LOG(x) std::cout << "[clDNN][reorder_inputs] " << x << std::endl
-#endif
-
-#if CLDNN_REORDER_INPUTS_VERBOSE_PATTERN_MATCH
-#define CLDNN_REORDER_INPUTS_PATTERN_MATCH_LOG(desc, id) CLDNN_REORDER_INPUTS_LOG(id << " matched for pattern: " << desc)
-#else
-#define CLDNN_REORDER_INPUTS_PATTERN_MATCH_LOG(desc, id) do { } while (false)
-#endif
-
 using namespace cldnn;

 // ToDo remove friendship relation from program
@@ -52,7 +31,17 @@ void reorder_inputs::run(program& p) { run(p, _lo, _rf); }
 namespace {

 std::map<program_node*, format::type> get_preferred_formats(program& p, layout_optimizer& lo) {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+
     std::map<program_node*, format::type> fmt_map;
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    size_t onednn_impls_counter = 0;
+    size_t all_impls_counter = 0;
+    const float onednn_min_threshold = 0.1f;
+    bool should_update_fmt_map = false;
+
+    // Calculate onednn kernels number and all kernels number inside the network
     for (auto n : p.get_processing_order()) {
         if (!n->is_in_data_flow())
             continue;
@@ -62,6 +51,51 @@ std::map<program_node*, format::type> get_preferred_formats(program& p, layout_o
         fmt_map[n] = ex;

         n->set_preferred_impl_type(impl);
+
+        if (impl == impl_types::onednn)
+            onednn_impls_counter++;
+
+        all_impls_counter++;
+    }
+
+    float onednn_usage_ratio = all_impls_counter ? static_cast<float>(onednn_impls_counter) / static_cast<float>(all_impls_counter) : 0.f;
+
+    GPU_DEBUG_IF(debug_config->verbose >= 1) {
+        GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
+        GPU_DEBUG_COUT << "Onednn kernels number: " << onednn_impls_counter << " from " << all_impls_counter
+                       << " (" << onednn_usage_ratio * 100.f << "%)" << std::endl;
+        GPU_DEBUG_COUT << "Onednn usage threshold: " << onednn_min_threshold * 100.f << "%" << std::endl;
+    }
+
+    // Reverted to cldnn way for cases when onednn kernels number inside the whole network is extremely low =>
+    // improvements from onednn usage less than losses due to unoptimized formats for cldnn kernels, extra reorders, etc.
+    if (onednn_usage_ratio < onednn_min_threshold && lo.get_optimization_attributes().use_onednn_impls) {
+        should_update_fmt_map = true;
+        lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, 0);
+        GPU_DEBUG_IF(debug_config->verbose >= 1) {
+            GPU_DEBUG_COUT << "The return to clDNN implementations" << std::endl;
+        }
+    }
+
+    GPU_DEBUG_IF(debug_config->verbose >= 1) {
+        GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
+    }
+#endif // ENABLE_ONEDNN_FOR_GPU
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    if (should_update_fmt_map)
+#endif
+    {
+        for (auto n : p.get_processing_order()) {
+            if (!n->is_in_data_flow())
+                continue;
+
+            auto ex = lo.get_preferred_format(*n);
+            auto impl = lo.get_preferred_impl_type(*n, ex);
+            fmt_map[n] = ex;
+
+            n->set_preferred_impl_type(impl);
+        }
     }
     return fmt_map;
 }
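The core of the rollback above is a single ratio test; a minimal standalone sketch of it, with the program/layout_optimizer plumbing stripped away (should_rollback_to_cldnn is a hypothetical name):

#include <cstddef>

// True when so few nodes map to oneDNN that the extra reorders and
// clDNN-unfriendly formats it forces would outweigh its kernel-level gains.
bool should_rollback_to_cldnn(std::size_t onednn_impls, std::size_t all_impls) {
    const float onednn_min_threshold = 0.1f;  // same 10% cut-off as the patch
    const float ratio = all_impls
        ? static_cast<float>(onednn_impls) / static_cast<float>(all_impls)
        : 0.f;
    return ratio < onednn_min_threshold;
}

For example, a network with 200 data-flow nodes of which only 12 prefer oneDNN gives a ratio of 0.06 < 0.1, so the pass resets use_onednn_impls and reruns format selection with clDNN implementations only.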
@@ -408,34 +442,34 @@ void insert_reorders(program& p, const std::map<program_node*, format::type>& fm
 } // namespace

 void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf) {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+
     auto fmt_map = get_preferred_formats(p, lo);
-#if CLDNN_REORDER_INPUTS_VERBOSE_PREFERRED
-    {
-        CLDNN_REORDER_INPUTS_LOG("Preferred formats:");
+    GPU_DEBUG_IF(debug_config->verbose >= 2) {
+        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Preferred formats:" << std::endl;
         for (auto& node_fmt : fmt_map) {
             if (node_fmt.second != format::any) {
-                CLDNN_REORDER_INPUTS_LOG(" " << node_fmt.first->id() << " " << fmt_to_str(node_fmt.second));
+                GPU_DEBUG_COUT << "[clDNN][reorder_inputs] " << node_fmt.first->id() << " " << fmt_to_str(node_fmt.second) << std::endl;
             }
         }
     }
-#endif
+
     propagate_formats(p, fmt_map, lo);
     minimize_local_reorders(p, fmt_map, lo);

-#if CLDNN_REORDER_INPUTS_VERBOSE_FORMATS
-    {
-        CLDNN_REORDER_INPUTS_LOG("Selected formats:");
+    GPU_DEBUG_IF(debug_config->verbose >= 2) {
+        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Selected formats:" << std::endl;
         for (auto node_ptr : p.get_processing_order()) {
             if (fmt_map.count(node_ptr) == 0)
                 continue;

             auto fmt = fmt_map.at(node_ptr);
-            CLDNN_REORDER_INPUTS_LOG(" " << node_ptr->id() << " " << fmt_to_str(fmt));
+            GPU_DEBUG_COUT << "[clDNN][reorder_inputs] " << node_ptr->id() << " " << fmt_to_str(fmt) << std::endl;
         }
     }
-#endif
-#if CLDNN_REORDER_INPUTS_VERBOSE_STATISTICS
-    {
+
+    GPU_DEBUG_IF(debug_config->verbose >= 1) {
         reorder_cnt total_reorder_count = std::accumulate(
             p.get_processing_order().begin(),
             p.get_processing_order().end(),
@@ -447,8 +481,8 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
             return reorder_cnt{ total.number + count.number, total.total_sizes + count.total_sizes };
         });
         // Divide results by two as above function will each reorder from both sides
-        CLDNN_REORDER_INPUTS_LOG("Total number of reorders: " << total_reorder_count.number / 2);
-        CLDNN_REORDER_INPUTS_LOG("Total elements count of all reorders: " << total_reorder_count.total_sizes / 2);
+        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Total number of reorders: " << total_reorder_count.number / 2 << std::endl;
+        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Total elements count of all reorders: " << total_reorder_count.total_sizes / 2 << std::endl;

         // Count number of reorders that will be fused
         size_t nodes_with_fusing = 0;
@@ -464,9 +498,9 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
                 }
             }
         }
-        CLDNN_REORDER_INPUTS_LOG("Number of nodes with fused reorders: " << nodes_with_fusing);
+        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Number of nodes with fused reorders: " << nodes_with_fusing << std::endl;
+        GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
     }
-#endif

     insert_reorders(p, fmt_map, rf, lo);
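Net effect of the hunks above: the statistics that previously required rebuilding with CLDNN_REORDER_INPUTS_VERBOSE set are now available at runtime, printed when debug_config->verbose >= 1 (per-node preferred/selected format dumps at >= 2) via the debug_configuration header added earlier in this patch; the verbose level is presumably set through the GPU debug environment variable of this release (e.g. OV_GPU_Verbose).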
@@ -154,38 +154,50 @@ namespace detail {

 attach_convolution_impl::attach_convolution_impl() {
     implementation_map<convolution>::add(impl_types::ocl, convolution_impl::create, {
-        std::make_tuple(data_types::f32, format::yxfb),
-        std::make_tuple(data_types::f16, format::yxfb),
         std::make_tuple(data_types::f32, format::bfyx),
         std::make_tuple(data_types::f16, format::bfyx),
         std::make_tuple(data_types::i8, format::bfyx),
         std::make_tuple(data_types::u8, format::bfyx),
+
+        std::make_tuple(data_types::f32, format::yxfb),
+        std::make_tuple(data_types::f16, format::yxfb),
+
         std::make_tuple(data_types::f32, format::bfzyx),
         std::make_tuple(data_types::f16, format::bfzyx),
         std::make_tuple(data_types::i8, format::bfzyx),
         std::make_tuple(data_types::u8, format::bfzyx),
+
         std::make_tuple(data_types::f32, format::winograd_2x3_s1_data),
         std::make_tuple(data_types::f16, format::winograd_2x3_s1_data),
+
+        std::make_tuple(data_types::f16, format::fs_b_yx_fsv32),
+
         std::make_tuple(data_types::f32, format::byxf),
         std::make_tuple(data_types::f16, format::byxf),
-        std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
-        std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
-        std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
-        std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
-        std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
-        std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
-        std::make_tuple(data_types::i8, format::b_fs_yx_fsv4),
-        std::make_tuple(data_types::u8, format::b_fs_yx_fsv4),
-        std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
-        std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
-        std::make_tuple(data_types::u8, format::b_fs_zyx_fsv32),
-        std::make_tuple(data_types::i8, format::b_fs_zyx_fsv32),
+        std::make_tuple(data_types::u8, format::byxf),
         std::make_tuple(data_types::i8, format::byxf),
-        std::make_tuple(data_types::i8, format::b_fs_yx_fsv4),
+
         std::make_tuple(data_types::u8, format::b_fs_yx_fsv4),
-        std::make_tuple(data_types::f16, format::fs_b_yx_fsv32),
+        std::make_tuple(data_types::i8, format::b_fs_yx_fsv4),
+
+        std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
+
         std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
         std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
+        std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
+        std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
+
+        std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
+        std::make_tuple(data_types::f32, format::b_fs_yx_fsv32),
+        std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
+        std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
+
+        std::make_tuple(data_types::u8, format::b_fs_zyx_fsv32),
+        std::make_tuple(data_types::i8, format::b_fs_zyx_fsv32),
+
         std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
         std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
@@ -119,13 +119,17 @@ attach_deconvolution_impl::attach_deconvolution_impl() {
         std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
         std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
         std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::f32, format::b_fs_yx_fsv32),
         std::make_tuple(data_types::f32, format::bs_fs_yx_bsv16_fsv16),
+        std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv32),
         std::make_tuple(data_types::f16, format::yxfb),
         std::make_tuple(data_types::f16, format::bfyx),
         std::make_tuple(data_types::f16, format::bfzyx),
         std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
         std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
         std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
+        std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv32),
         std::make_tuple(data_types::f32, format::byxf),
         std::make_tuple(data_types::f16, format::byxf),
         std::make_tuple(data_types::i8, format::bfyx),
@@ -134,10 +138,14 @@ attach_deconvolution_impl::attach_deconvolution_impl() {
         std::make_tuple(data_types::u8, format::bfzyx),
         std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
         std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
+        std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
         std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
         std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
         std::make_tuple(data_types::i8, format::bs_fs_yx_bsv16_fsv16),
         std::make_tuple(data_types::u8, format::bs_fs_yx_bsv16_fsv16),
+        std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv32),
+        std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv32),
         std::make_tuple(data_types::i8, format::bs_fs_zyx_bsv16_fsv16),
         std::make_tuple(data_types::u8, format::bs_fs_zyx_bsv16_fsv16),
     });
@@ -80,6 +80,17 @@ attach_mvn_impl::attach_mvn_impl() {
         std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
         std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
+
+        // TODO: uncomment this code when fsv32 optimizations for MVN will be implemented
+        /*std::make_tuple(data_types::f32, format::b_fs_yx_fsv32),
+        std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
+        std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
+        std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
+
+        std::make_tuple(data_types::f32, format::b_fs_zyx_fsv32),
+        std::make_tuple(data_types::f16, format::b_fs_zyx_fsv32),
+        std::make_tuple(data_types::u8, format::b_fs_zyx_fsv32),
+        std::make_tuple(data_types::i8, format::b_fs_zyx_fsv32),*/
+
         std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
         std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
         std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
@ -867,7 +867,8 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
|
|||||||
auto in_dt = in_layout.data_type;
|
auto in_dt = in_layout.data_type;
|
||||||
auto out_dt = out_layout.data_type;
|
auto out_dt = out_layout.data_type;
|
||||||
if ((out_layout.count() == in_layout.count()) &&
|
if ((out_layout.count() == in_layout.count()) &&
|
||||||
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) {
|
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
|
||||||
|
fo.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(in_layout)) {
|
||||||
onednn_valid_post_ops = false;
|
onednn_valid_post_ops = false;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
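Note on the condition change (repeated in two matching hunks of get_preferred_impl_type below): previously any fused eltwise whose input and output data types differed, with a floating-point type on either side, invalidated oneDNN post-ops; the added needs_onednn_sum_post_op(in_layout) call restricts the fallback to fusions that would actually lower to a oneDNN sum post-op.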
@@ -890,6 +891,10 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,

         /* ***************************** OneDNN impls format selection part ****************************** */
         bool valid_grouped = !is_dw && prim->groups > 1 && (ofm_per_group % compute_block == 0 && ifm_per_group % compute_block == 0);
+        // TODO: uncomment this code when corresponding fsv32 optimizations inside clDNN will be implemented
+        // bool i8_u8_output = output_layout.data_type == data_types::u8 || output_layout.data_type == data_types::i8;
+        // bool is_first_conv = input_layout.size.feature[0] < 4;
+
         if (i8_u8_input) {
             if ((non_grouped || valid_grouped || valid_int8_dw) && onednn_valid_post_ops && is_2d) {
                 if (input_layout.size.batch[0] % 16 == 0) {
@@ -929,14 +934,21 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
         } else if (input_layout.data_type == data_types::f16 &&
                    convolution_bs_fs_yx_bsv16_fsv16_opt(input_layout, output_layout, weights_layout, prim) &&
                    (output_layout.data_type == input_layout.data_type ||
-                    !data_type_traits::is_floating_point(input_layout.data_type))) {
+                    !data_type_traits::is_floating_point(input_layout.data_type)) && is_2d) {
             expected_tensor = current_layout.size;
             if (prim->groups == 1 || (output_layout.size.feature[0] % 16 == 0 && input_layout.size.feature[0] % 16 == 0)) {
                 expected_format = cldnn::format::bs_fs_yx_bsv32_fsv16;
             } else {
                 expected_format = cldnn::format::bs_fs_yx_bsv16_fsv16;
             }
-        }
+        } // TODO: add this case when corresponding fsv32 optimizations inside clDNN will be implemented
+        //else if (input_layout.data_type == data_types::f32 && i8_u8_output && !is_first_conv && is_2d) {
+        //    if (input_layout.size.batch[0] % 16 == 0) {
+        //        expected_format = cldnn::format::bs_fs_yx_bsv32_fsv32;
+        //    } else {
+        //        expected_format = cldnn::format::b_fs_yx_fsv32;
+        //    }
+        //}
     } else {
         /* *************************** Native impls format selection part ************************** */
         if (i8_u8_input) {
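Two things happen in this hunk: the f16 bs_fs_yx_bsv32_fsv16 / bsv16_fsv16 path gains an is_2d guard, and the f32-input/int8-output fsv32 branch is kept only as commented-out code (together with the i8_u8_output / is_first_conv helpers a few hunks above) until clDNN gains the corresponding fsv32-optimized kernels.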
@@ -1293,7 +1305,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
                 auto in_dt = in_layout.data_type;
                 auto out_dt = out_layout.data_type;
                 if ((out_layout.count() == in_layout.count()) &&
-                    (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) {
+                    (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
+                    fo.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(in_layout)) {
                     impl_candidate = impl_types::ocl;
                     break;
                 }
@@ -1352,7 +1365,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
                 auto in_dt = in_layout.data_type;
                 auto out_dt = out_layout.data_type;
                 if ((out_layout.count() == in_layout.count()) &&
-                    (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) {
+                    (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
+                    fo.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(in_layout)) {
                     impl_candidate = impl_types::ocl;
                     break;
                 }
@@ -1380,11 +1394,11 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
 format layout_optimizer::get_preferred_format(program_node& node) {
     format expected = format::any;
     auto output_layout = node.get_output_layout();
+    bool use_onednn_impls = _optimization_attributes.use_onednn_impls;
+
     if (!_forcing_map.empty() && _forcing_map.count(node.id()) != 0) {
         expected = _forcing_map.at(node.id()).first;
     } else if (node.is_type<convolution>()) {
-        bool use_onednn_impls = _optimization_attributes.use_onednn_impls;
         auto& conv_node = node.as<convolution>();
         auto weights_layout = conv_node.weights(0).get_output_layout();
         expected = get_expected_layout(output_layout, conv_node, weights_layout).format;
|
|||||||
auto& bconv_node = node.as<binary_convolution>();
|
auto& bconv_node = node.as<binary_convolution>();
|
||||||
auto weights_layout = bconv_node.weights(0).get_output_layout();
|
auto weights_layout = bconv_node.weights(0).get_output_layout();
|
||||||
expected = get_expected_layout(output_layout, bconv_node, weights_layout).format;
|
expected = get_expected_layout(output_layout, bconv_node, weights_layout).format;
|
||||||
} else if (node.is_type<pooling>() && _optimization_attributes.use_onednn_impls) {
|
|
||||||
auto in_layout = node.get_dependency(0).get_output_layout();
|
|
||||||
|
|
||||||
if (output_layout.size.batch[0] % 16 == 0 || output_layout.size.batch[0] == 8) {
|
|
||||||
if (!data_type_traits::is_floating_point(in_layout.data_type) && in_layout.data_type != output_layout.data_type) {
|
|
||||||
expected = format::b_fs_yx_fsv16;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (node.is_type<detection_output>()) {
|
} else if (node.is_type<detection_output>()) {
|
||||||
expected = get_expected_layout(
|
expected = get_expected_layout(
|
||||||
output_layout,
|
output_layout,
|
||||||
node.as<detection_output>(),
|
node.as<detection_output>(),
|
||||||
layout{ data_types::f32, format::bfyx, tensor{} }).format;
|
layout{ data_types::f32, format::bfyx, tensor{} }).format;
|
||||||
} else if (node.is_type<quantize>()) {
|
} else if (node.is_type<quantize>()) {
|
||||||
bool use_onednn_impls = _optimization_attributes.use_onednn_impls;
|
|
||||||
auto layout = node.get_output_layout();
|
auto layout = node.get_output_layout();
|
||||||
if (layout.format.spatial_num() == 2 &&
|
if (layout.format.spatial_num() == 2 &&
|
||||||
(layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&
|
(layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&
|
||||||
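In get_preferred_format, use_onednn_impls is now read once at function scope (see the previous hunk) instead of per-branch, and the oneDNN-only pooling special case, which forced b_fs_yx_fsv16 for integer-input pooling with a differing output type when the batch was 8 or a multiple of 16, is dropped, consistent with the rollback.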