[GPU] New OneDNN post-ops optimizations and fusing tests (#8056)

* [GPU] New OneDNN post-ops optimizations and fusing tests

* [GPU] Code refactoring based on GitHub review comments

Failing test disabled in #8109
Ilya Znamenskiy 2021-10-28 09:30:08 +03:00 committed by GitHub
parent 6f862822e1
commit 262c87c6e1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 475 additions and 78 deletions

@@ -161,6 +161,33 @@ protected:
     }
 };

+// Check that post-op type is any optimized
+auto type_is_any_optimized = [](onednn_post_op_type type) -> bool {
+    return type == onednn_post_op_type::optimized || type == onednn_post_op_type::optimized_sum ||
+           type == onednn_post_op_type::optimized_eltwise;
+};
+
+// Check that post-op type is eltwise
+auto type_is_eltwise = [](onednn_post_op_type type) -> bool {
+    return type == onednn_post_op_type::eltwise_round || type == onednn_post_op_type::eltwise_linear ||
+           type == onednn_post_op_type::eltwise_clip || type == onednn_post_op_type::eltwise_act;
+};
+
+// Check that post-op type is binary_add or binary_mul
+auto type_is_binary_add_or_mul = [](onednn_post_op_type type) -> bool {
+    return type == onednn_post_op_type::binary_add || type == onednn_post_op_type::binary_mul;
+};
+
+// Simple post-op type checks
+auto type_is_optimized = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::optimized; };
+auto type_is_eltwise_linear = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::eltwise_linear; };
+auto type_is_optimized_eltwise = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::optimized_eltwise; };
+auto type_is_binary_add = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::binary_add; };
+auto type_is_binary_mul = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::binary_mul; };
+auto type_is_sum = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::sum; };
+auto type_is_optimized_sum = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::optimized_sum; };
+auto type_is_scale = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::scale; };
+
 auto& cur_post_ops = onednn_fusing_map[node_id];
 size_t cur_post_op_idx = 1;
@@ -169,11 +196,11 @@ protected:
 // Check and update post-op map if we already optimized something
 for (size_t post_op_idx = 0; post_op_idx < cur_post_ops.size(); post_op_idx++) {
-    if (cur_post_ops[post_op_idx].op_type == onednn_post_op_type::optimized_sum)
+    if (type_is_optimized_sum(cur_post_ops[post_op_idx].op_type))
         cur_post_ops[post_op_idx].op_type = onednn_post_op_type::sum;
-    else if (cur_post_ops[post_op_idx].op_type == onednn_post_op_type::optimized_eltwise)
+    else if (type_is_optimized_eltwise(cur_post_ops[post_op_idx].op_type))
         cur_post_ops[post_op_idx].op_type = onednn_post_op_type::eltwise_linear;
-    else if (cur_post_ops[post_op_idx].op_type == onednn_post_op_type::optimized)
+    else if (type_is_optimized(cur_post_ops[post_op_idx].op_type))
         cur_post_ops.erase(cur_post_ops.begin() + post_op_idx);
 }
@@ -186,8 +213,7 @@ protected:
 auto prev_type = cur_post_ops[prev_post_op_idx].op_type;

 // Ignore optimized operations for "previous" operation in our operation pair
-while ((prev_type == onednn_post_op_type::optimized || prev_type == onednn_post_op_type::optimized_sum ||
-        prev_type == onednn_post_op_type::optimized_eltwise) && cur_post_op_idx < post_ops_size - 1) {
+while (type_is_any_optimized(prev_type) && cur_post_op_idx < post_ops_size - 1) {
     prev_post_op_idx++;
     cur_post_op_idx++;
     prev_type = cur_post_ops[prev_post_op_idx].op_type;
@@ -195,80 +221,83 @@ protected:
 }

 // Ignore optimized operations for "current" operation in our operation pair
-while ((cur_type == onednn_post_op_type::optimized || cur_type == onednn_post_op_type::optimized_sum ||
-        cur_type == onednn_post_op_type::optimized_eltwise) && cur_post_op_idx < post_ops_size - 1) {
+while (type_is_any_optimized(cur_type) && cur_post_op_idx < post_ops_size - 1) {
     cur_post_op_idx++;
     cur_type = cur_post_ops[cur_post_op_idx].op_type;
 }

 auto cur_idx = static_cast<int>(has_out_scales(attr) ? (cur_post_op_idx >= 1 ? cur_post_op_idx - 1 : 0) : cur_post_op_idx);
 auto prev_idx = static_cast<int>(has_out_scales(attr) ? (prev_post_op_idx >= 1 ? prev_post_op_idx - 1 : 0) : prev_post_op_idx);

-auto cur_type_is_optimized = cur_type == onednn_post_op_type::optimized ||
-                             cur_type == onednn_post_op_type::optimized_sum ||
-                             cur_type == onednn_post_op_type::optimized_eltwise;
-auto prev_type_is_optimized = prev_type == onednn_post_op_type::optimized ||
-                              prev_type == onednn_post_op_type::optimized_sum ||
-                              prev_type == onednn_post_op_type::optimized_eltwise;
-
 // If this is the last pair and it's optimized - add the last post-op and go out from the cycle
-if (cur_post_op_idx == post_ops_size - 1 && (cur_type_is_optimized || prev_type_is_optimized)) {
-    if (!prev_type_is_optimized) {
+if (cur_post_op_idx == post_ops_size - 1 && (type_is_any_optimized(cur_type) || type_is_any_optimized(prev_type))) {
+    if (!type_is_any_optimized(prev_type)) {
         add_post_op(prev_type, p_ops, optimized_p_ops, prev_idx);
     }
-    if (!cur_type_is_optimized) {
+    if (!type_is_any_optimized(cur_type)) {
         add_post_op(cur_type, p_ops, optimized_p_ops, cur_idx);
     }
     break;
 }

-auto equal_ops = cur_type == prev_type;
-auto cur_type_is_binary_add_or_mul = cur_type == onednn_post_op_type::binary_add || cur_type == onednn_post_op_type::binary_mul;
-auto prev_type_is_binary_add_or_mul = prev_type == onednn_post_op_type::binary_add || prev_type == onednn_post_op_type::binary_mul;
-
 // Post-ops combinations which can be simplified
-auto eltw_and_eltw = equal_ops && cur_type == onednn_post_op_type::eltwise_linear;
-auto bin_and_eltw = cur_type_is_binary_add_or_mul && prev_type == onednn_post_op_type::eltwise_linear;
-auto eltw_and_bin = cur_type == onednn_post_op_type::eltwise_linear && prev_type_is_binary_add_or_mul;
-auto eltw_and_sum = cur_type == onednn_post_op_type::eltwise_linear && prev_type == onednn_post_op_type::sum;
-auto eltw_and_scale = cur_type == onednn_post_op_type::eltwise_linear && prev_type == onednn_post_op_type::scale;
+auto eltw_and_eltw = type_is_eltwise(cur_type) && type_is_eltwise(prev_type);
+auto bin_and_eltw = type_is_binary_add_or_mul(cur_type) && type_is_eltwise_linear(prev_type);
+auto eltw_and_bin = type_is_eltwise_linear(cur_type) && type_is_binary_add_or_mul(prev_type);
+auto sum_and_eltw = type_is_sum(cur_type) && type_is_eltwise(prev_type);
+auto eltw_and_scale = type_is_eltwise_linear(cur_type) && type_is_scale(prev_type);

 auto can_try_optimize = eltw_and_eltw ||
                         bin_and_eltw ||
                         eltw_and_bin ||
-                        eltw_and_sum ||
+                        sum_and_eltw ||
                         eltw_and_scale;

 bool cur_ops_pair_is_optimized = false;

 if (can_try_optimize) {
     if (eltw_and_eltw) {
-        dnnl::algorithm alg;
+        dnnl::algorithm cur_alg, prev_alg;
         float cur_scale, prev_scale, cur_alpha, prev_alpha, cur_beta, prev_beta;

-        p_ops.get_params_eltwise(prev_idx, prev_scale, alg, prev_alpha, prev_beta);
-        p_ops.get_params_eltwise(cur_idx, cur_scale, alg, cur_alpha, cur_beta);
+        p_ops.get_params_eltwise(prev_idx, prev_scale, prev_alg, prev_alpha, prev_beta);
+        p_ops.get_params_eltwise(cur_idx, cur_scale, cur_alg, cur_alpha, cur_beta);

-        // Eltwise + eltwise pair can be optimized only if cur_alpha is equal to 1.0f
-        if (cur_alpha == 1.0f && prev_scale == cur_scale) {
+        auto eltw_linear_and_eltw_linear = type_is_eltwise_linear(cur_type) && type_is_eltwise_linear(prev_type);
+        auto eltw_linear_and_eltw_non_linear = type_is_eltwise_linear(cur_type) && !type_is_eltwise_linear(prev_type) && cur_beta == 0;
+
+        // eltwise_linear + eltwise_linear combination can be optimized always
+        if (eltw_linear_and_eltw_linear) {
             dnnl::post_ops eltw_p_op;
-            eltw_p_op.append_eltwise(cur_scale, alg, prev_alpha, cur_beta + prev_beta);
+            float optimized_alpha = cur_alpha * prev_alpha * prev_scale;
+            float optimized_beta = cur_alpha * prev_beta * prev_scale + cur_beta;
+            float optimized_scale = cur_scale;
+            eltw_p_op.append_eltwise(optimized_scale, cur_alg, optimized_alpha, optimized_beta);

             // Combine 2 eltwises into one
             add_post_op(cur_type, eltw_p_op, optimized_p_ops, 0);
+        } else if (eltw_linear_and_eltw_non_linear) {
+            dnnl::post_ops eltw_p_op;
+            eltw_p_op.append_eltwise(cur_scale * prev_scale * cur_alpha, prev_alg, prev_alpha, prev_beta);
+
+            // Combine 2 eltwises into one
+            add_post_op(prev_type, eltw_p_op, optimized_p_ops, 0);
+        }
+
+        if (eltw_linear_and_eltw_linear || eltw_linear_and_eltw_non_linear) {
             // Marked current and previous eltwise operations as 'optimized' (they will be ignored on the next iteration of cycle)
             cur_post_ops[cur_post_op_idx].op_type = onednn_post_op_type::optimized;
             cur_post_ops[prev_post_op_idx].op_type = onednn_post_op_type::optimized_eltwise;

             // Set the flag if extra optimizations checking is needed
             if (cur_post_op_idx < post_ops_size - 1) {
-                if (cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::eltwise_linear ||
-                    cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::binary_add ||
-                    cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::binary_mul ||
-                    cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::optimized_eltwise) {
+                if (type_is_eltwise_linear(cur_post_ops[cur_post_op_idx + 1].op_type) ||
+                    type_is_binary_add_or_mul(cur_post_ops[cur_post_op_idx + 1].op_type) ||
+                    type_is_optimized_eltwise(cur_post_ops[cur_post_op_idx + 1].op_type)) {
                     optimization_is_completed = true;
                 }
             }

             cur_ops_pair_is_optimized = true;
         }
     } else if (bin_and_eltw) {
@@ -285,11 +314,13 @@ protected:
 auto bin_ops_can_be_optimized = cur_node.is_type<data>() && cur_node.is_constant() &&
                                 cur_node.get_users().size() == 1 && desc.data_type() == dnnl_f32;

-auto bin_add_and_eltw = alpha == 1.0f && scale == 1.0f && cur_type == onednn_post_op_type::binary_add && bin_ops_can_be_optimized;
-auto bin_mul_and_eltw = beta == 0.f && scale == 1.0f && cur_type == onednn_post_op_type::binary_mul && bin_ops_can_be_optimized;
+auto bin_add_and_eltw = alpha == 1.0f && scale == 1.0f && type_is_binary_add(cur_type) && bin_ops_can_be_optimized;
+auto bin_mul_and_eltw = beta == 0.f && type_is_binary_mul(cur_type) && bin_ops_can_be_optimized;

 if (bin_add_and_eltw || bin_mul_and_eltw) {
     memory::ptr cur_bin_mem_ptr = cur_node.as<data>().get_attached_memory_ptr();
+    if (cur_bin_mem_ptr == nullptr)
+        throw std::runtime_error("OneDNN post-ops optimization error: nonexistent node for bin + eltw");
     auto& stream = cur_bin_mem_ptr->get_engine()->get_program_stream();
     mem_lock<float, mem_lock_type::write> bin_and_eltw_lock(cur_bin_mem_ptr, stream);
@@ -302,7 +333,7 @@ protected:
     }
 } else {
     for (size_t data_idx = 0; data_idx < cur_bin_mem_size; data_idx++) {
-        bin_and_eltw_lock[data_idx] *= alpha;
+        bin_and_eltw_lock[data_idx] *= alpha * scale;
     }
 }
@@ -325,11 +356,13 @@ protected:
 auto bin_ops_can_be_optimized = prev_node.is_type<data>() && prev_node.is_constant() &&
                                 prev_node.get_users().size() == 1 && desc.data_type() == dnnl_f32;

-auto eltw_and_bin_add = alpha == 1.0f && scale == 1.0f && prev_type == onednn_post_op_type::binary_add && bin_ops_can_be_optimized;
-auto eltw_and_bin_mul = beta == 0.f && scale == 1.0f && prev_type == onednn_post_op_type::binary_mul && bin_ops_can_be_optimized;
+auto eltw_and_bin_add = alpha == 1.0f && scale == 1.0f && type_is_binary_add(prev_type) && bin_ops_can_be_optimized;
+auto eltw_and_bin_mul = beta == 0.f && type_is_binary_mul(prev_type) && bin_ops_can_be_optimized;

 if (eltw_and_bin_add || eltw_and_bin_mul) {
     memory::ptr prev_bin_mem_ptr = prev_node.as<data>().get_attached_memory_ptr();
+    if (prev_bin_mem_ptr == nullptr)
+        throw std::runtime_error("OneDNN post-ops optimization error: nonexistent node for eltw + bin");
     auto& stream = prev_bin_mem_ptr->get_engine()->get_program_stream();
     mem_lock<float, mem_lock_type::write> eltw_and_bin_lock(prev_bin_mem_ptr, stream);
@@ -342,7 +375,7 @@ protected:
     }
 } else {
     for (size_t data_idx = 0; data_idx < prev_bin_mem_size; data_idx++) {
-        eltw_and_bin_lock[data_idx] *= alpha;
+        eltw_and_bin_lock[data_idx] *= alpha * scale;
     }
 }
@@ -351,63 +384,69 @@ protected:
         cur_ops_pair_is_optimized = true;
     }
-} else if (eltw_and_sum) {
+} else if (sum_and_eltw) {
     dnnl::algorithm alg;
-    float cur_scale, prev_scale, alpha, beta;
+    float sum_scale, eltw_scale, alpha, beta;
     dnnl::memory::data_type data_type;

-    cldnn::program_node& prev_node = arg.get_dependency(cur_post_ops[prev_post_op_idx].mem_dep);
+    dnnl::algorithm next_alg;
+    float next_scale, next_alpha, next_beta;
+    size_t next_idx = cur_idx + 1;
+    size_t next_post_op_idx = cur_post_op_idx + 1;

-    p_ops.get_params_eltwise(cur_idx, cur_scale, alg, alpha, beta);
-    p_ops.get_params_sum(prev_idx, prev_scale, data_type);
+    bool can_optimize_eltw_and_sum = false;

-    // Eltwise operations can use runtime non-constant data buffers, so check that memory buffers consist of constant data only
-    auto eltw_ops_can_be_optimized = prev_node.is_type<data>() && prev_node.is_constant() &&
-                                     prev_node.get_users().size() == 1;
+    if (cur_post_op_idx < post_ops_size - 1) {
+        auto next_type = cur_post_ops[next_post_op_idx].op_type;
+        if (type_is_eltwise_linear(next_type)) {
+            p_ops.get_params_eltwise(next_idx, next_scale, next_alg, next_alpha, next_beta);

-    // Eltwise can be inserted into the scale field of previous sum if cur_beta is equal to 0.f
-    if (beta == 0.f && cur_scale == 1.0f && eltw_ops_can_be_optimized) {
-        dnnl::post_ops sum_p_op;
-        sum_p_op.append_sum(alpha * prev_scale, data_type);
-
-        // Insert cur eltwise into sum
-        add_post_op(prev_type, sum_p_op, optimized_p_ops, 0);
-
-        memory::ptr prev_eltw_mem_ptr = prev_node.as<data>().get_attached_memory_ptr();
-        auto& stream = prev_eltw_mem_ptr->get_engine()->get_program_stream();
-        mem_lock<float, mem_lock_type::write> eltw_and_sum_lock(prev_eltw_mem_ptr, stream);
-        size_t prev_eltw_mem_size = prev_node.get_output_layout().count();
-
-        // Also multiply sum on alpha for getting valid results
-        for (size_t data_idx = 0; data_idx < prev_eltw_mem_size; data_idx++) {
-            eltw_and_sum_lock[data_idx] *= alpha;
-        }
+            if (next_beta == 0)
+                can_optimize_eltw_and_sum = true;
+        }
+    }

-        // Marked current and previous operations as 'optimized' (they will be ignored on the next iteration of cycle)
-        cur_post_ops[cur_post_op_idx].op_type = onednn_post_op_type::optimized;
-        cur_post_ops[prev_post_op_idx].op_type = onednn_post_op_type::optimized_sum;
+    // Try to optimize eltwise (any) + sum + eltwise_linear (with beta = 0) chain of operations
+    if (can_optimize_eltw_and_sum) {
+        p_ops.get_params_sum(cur_idx, sum_scale, data_type);
+        p_ops.get_params_eltwise(prev_idx, eltw_scale, alg, alpha, beta);
+
+        dnnl::post_ops eltw_p_op_prev, sum_p_op;
+
+        eltw_p_op_prev.append_eltwise(eltw_scale * next_alpha * next_scale, alg, alpha, beta);
+        sum_p_op.append_sum(sum_scale * next_alpha, data_type);
+
+        add_post_op(prev_type, eltw_p_op_prev, optimized_p_ops, 0);
+        add_post_op(cur_type, sum_p_op, optimized_p_ops, 0);
+
+        // Marked current, previous and next operations as 'optimized' (they will be ignored on the next iteration of cycle)
+        cur_post_ops[prev_post_op_idx].op_type = onednn_post_op_type::optimized_eltwise;
+        cur_post_ops[cur_post_op_idx].op_type = onednn_post_op_type::optimized_sum;
+        cur_post_ops[next_post_op_idx].op_type = onednn_post_op_type::optimized;

         // Set the flag if extra optimizations checking is needed
-        if (cur_post_op_idx < post_ops_size - 1) {
-            if (cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::eltwise_linear ||
-                cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::optimized_eltwise) {
+        if (next_post_op_idx < post_ops_size - 1) {
+            if (type_is_eltwise_linear(cur_post_ops[next_post_op_idx + 1].op_type) ||
+                type_is_optimized_eltwise(cur_post_ops[next_post_op_idx + 1].op_type)) {
                 optimization_is_completed = true;
             }
         }

         cur_ops_pair_is_optimized = true;
     }
 } else if (eltw_and_scale) {
     dnnl::algorithm alg;
-    float cur_scale, alpha, beta;
+    float eltw_scale, alpha, beta;

     cldnn::program_node& prev_node = arg.get_dependency(cur_post_ops[prev_post_op_idx].mem_dep);

-    p_ops.get_params_eltwise(cur_idx, cur_scale, alg, alpha, beta);
+    p_ops.get_params_eltwise(cur_idx, eltw_scale, alg, alpha, beta);

-    // Eltwise can be inserted into output_scale if cur_beta is equal to 0.f and cur_scale is equal to 1.0f
-    if (beta == 0.f && cur_scale == 1.0f && prev_node.get_output_layout().data_type == data_types::f32) {
+    // Eltwise can be inserted into the output_scale if cur_beta is equal to 0.f
+    if (beta == 0.f && prev_node.get_output_layout().data_type == data_types::f32) {
         memory::ptr prev_scale_mem_ptr = prev_node.as<data>().get_attached_memory_ptr();
+        if (prev_scale_mem_ptr == nullptr)
+            throw std::runtime_error("OneDNN post-ops optimization error: nonexistent node for eltw + scale");
         auto& stream = prev_scale_mem_ptr->get_engine()->get_program_stream();
         mem_lock<float, mem_lock_type::write> eltw_and_scale_lock(prev_scale_mem_ptr, stream);
@@ -415,7 +454,7 @@ protected:
         // Update all scale coefficients
         for (size_t data_idx = 0; data_idx < prev_scale_mem_size; data_idx++) {
-            eltw_and_scale_lock[data_idx] *= alpha;
+            eltw_and_scale_lock[data_idx] *= alpha * eltw_scale;
         }

         // Marked current eltwise operation as 'optimized' (it will be ignored on the next iteration of cycle)
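
The eltwise folding above is just composition of affine maps: a oneDNN eltwise_linear post-op computes dst = scale * (alpha * src + beta), so applying cur after prev expands to cur_scale * (cur_alpha * prev_scale * (prev_alpha * x + prev_beta) + cur_beta), which yields the optimized_alpha/optimized_beta expressions in the hunk; the eltw_linear_and_eltw_non_linear branch is the special case where cur is linear with cur_beta = 0, so only the combined scale cur_scale * prev_scale * cur_alpha survives. A minimal standalone sketch of that arithmetic (editorial illustration, not part of the commit; eltw_linear_params and fold_linear are hypothetical names):

#include <cassert>

// dst = scale * (alpha * src + beta): the shape of a oneDNN eltwise_linear post-op
struct eltw_linear_params { float scale, alpha, beta; };

// Fold cur(prev(x)) into a single eltwise_linear, mirroring optimized_alpha / optimized_beta above:
//   cur_scale * (cur_alpha * (prev_scale * (prev_alpha * x + prev_beta)) + cur_beta)
// = cur_scale * ((cur_alpha * prev_alpha * prev_scale) * x + (cur_alpha * prev_beta * prev_scale + cur_beta))
eltw_linear_params fold_linear(eltw_linear_params prev, eltw_linear_params cur) {
    return { cur.scale,
             cur.alpha * prev.alpha * prev.scale,
             cur.alpha * prev.beta * prev.scale + cur.beta };
}

int main() {
    // Numbers from the eltw_linear + eltw_linear test below:
    // eltwise_linear:12.75:127.5 + eltwise_linear:1:-128 -> eltwise_linear:12.75:-0.5
    eltw_linear_params folded = fold_linear({1.f, 12.75f, 127.5f}, {1.f, 1.f, -128.f});
    assert(folded.alpha == 12.75f && folded.beta == -0.5f);
    return 0;
}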

@@ -9433,6 +9433,364 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_shift_swish_onednn,
bc_test_params{CASE_CONV_S8S8_13, 2, 7},
bc_test_params{CASE_CONV_S8S8_15, 2, 7},
}));
/* ----------------------------------------------------------------------------------------------------- */
/* ------------------------------ OneDNN post-ops cases with optimizations ----------------------------- */
/* ----------------------------------------------------------------------------------------------------- */
// Before optimization: eltw_linear + eltw_linear
// After optimization: eltw_linear
// Limitations: none
// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_linear:1:-128
// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_linear:12.75:-0.5
class post_ops_optimizations_onednn_eltw_linear_eltw_linear : public ConvFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_eltw_linear_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_single_element_layout(p), -10)),
data("in_hi", get_mem(get_single_element_layout(p), 10)),
data("out_lo", get_mem(get_single_element_layout(p), -128)),
data("out_hi", get_mem(get_single_element_layout(p), 127)),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_linear_eltw_linear,
::testing::ValuesIn(std::vector<bc_test_params>{
// cases with batch = 1
bc_test_params{CASE_CONV_U8S8_1, 2, 3},
bc_test_params{CASE_CONV_U8S8_2, 2, 3},
bc_test_params{CASE_CONV_U8S8_3, 2, 3},
bc_test_params{CASE_CONV_S8S8_1, 2, 3},
bc_test_params{CASE_CONV_S8S8_2, 2, 3},
bc_test_params{CASE_CONV_S8S8_3, 2, 3},
// cases with batch = 16
bc_test_params{CASE_CONV_U8S8_9, 2, 3},
bc_test_params{CASE_CONV_U8S8_10, 2, 3},
bc_test_params{CASE_CONV_S8S8_9, 2, 3},
bc_test_params{CASE_CONV_S8S8_10, 2, 3},
// cases with batch = 32
bc_test_params{CASE_CONV_U8S8_11, 2, 3},
bc_test_params{CASE_CONV_U8S8_12, 2, 3},
bc_test_params{CASE_CONV_U8S8_13, 2, 3},
bc_test_params{CASE_CONV_U8S8_14, 2, 3},
bc_test_params{CASE_CONV_S8S8_12, 2, 3},
bc_test_params{CASE_CONV_S8S8_13, 2, 3},
bc_test_params{CASE_CONV_S8S8_14, 2, 3},
bc_test_params{CASE_CONV_S8S8_15, 2, 3},
}));
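
// Editorial worked check (not part of the commit) of where the log constants above come from:
// quantizing in_lo = -10, in_hi = 10 into 256 levels gives alpha = 255 / (10 - (-10)) = 12.75 and
// beta = 127.5, i.e. eltwise_linear:12.75:127.5; the i8 shift is eltwise_linear:1:-128, and folding
// via optimized_beta = cur_alpha * prev_beta * prev_scale + cur_beta gives 127.5 - 128 = -0.5.
static_assert(255.f / 20.f == 12.75f, "first eltwise_linear alpha");
static_assert(1.f * 127.5f * 1.f + (-128.f) == -0.5f, "folded beta matches eltwise_linear:12.75:-0.5");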
// Before optimization: eltw_non_linear + eltw_linear
// After optimization: eltw_non_linear
// Limitations: beta = 0 in eltw_linear
// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round+eltwise_linear:2.00784+eltwise_clip:0:512
// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round:0:0:2.00784+eltwise_clip:0:512
class post_ops_optimizations_onednn_eltw_non_linear_eltw_linear : public ConvFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_eltw_non_linear_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_single_element_layout(p), -10)),
data("in_hi", get_mem(get_single_element_layout(p), 10)),
data("out_lo", get_mem(get_single_element_layout(p), 0)),
data("out_hi", get_mem(get_single_element_layout(p), 512)),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::f32),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_non_linear_eltw_linear,
::testing::ValuesIn(std::vector<bc_test_params>{
// cases with batch = 1
bc_test_params{CASE_CONV_U8S8_1, 2, 3},
bc_test_params{CASE_CONV_U8S8_2, 2, 3},
bc_test_params{CASE_CONV_U8S8_3, 2, 3},
bc_test_params{CASE_CONV_S8S8_1, 2, 3},
bc_test_params{CASE_CONV_S8S8_2, 2, 3},
bc_test_params{CASE_CONV_S8S8_3, 2, 3},
// cases with batch = 16
bc_test_params{CASE_CONV_U8S8_9, 2, 3},
bc_test_params{CASE_CONV_U8S8_10, 2, 3},
bc_test_params{CASE_CONV_S8S8_9, 2, 3},
bc_test_params{CASE_CONV_S8S8_10, 2, 3},
// cases with batch = 32
bc_test_params{CASE_CONV_U8S8_11, 2, 3},
bc_test_params{CASE_CONV_U8S8_12, 2, 3},
bc_test_params{CASE_CONV_U8S8_13, 2, 3},
bc_test_params{CASE_CONV_U8S8_14, 2, 3},
bc_test_params{CASE_CONV_S8S8_12, 2, 3},
bc_test_params{CASE_CONV_S8S8_13, 2, 3},
bc_test_params{CASE_CONV_S8S8_14, 2, 3},
bc_test_params{CASE_CONV_S8S8_15, 2, 3},
}));
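
// Editorial check (not part of the commit): the trailing eltwise_linear:2.00784 has beta = 0, so it
// folds into the non-linear op's output scale as cur_scale * prev_scale * cur_alpha, which is why
// eltwise_round reappears as eltwise_round:0:0:2.00784 in the optimized log; 2.00784 is the
// requantization factor 512 / 255 from the quantize output range.
static_assert(512.f / 255.f > 2.0078f && 512.f / 255.f < 2.0079f, "requantization factor ~2.00784");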
// Before optimization: binary_add + eltw_linear
// After optimization: binary_add
// Limitations: alpha = 1 and scale = 1 in eltw_linear; binary_add is a constant compile-time buffer
// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:binary_add:f32:2+eltwise_linear:1:-127+eltwise_clip:-127:127
// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:binary_add:f32:2+eltwise_clip:-127:127
class post_ops_optimizations_onednn_binary_add_eltw_linear : public ConvFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_binary_add_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), -127)),
data("out_hi", get_mem(get_single_element_layout(p), 127)),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_add_eltw_linear,
::testing::ValuesIn(std::vector<bc_test_params>{
// cases with batch = 1
bc_test_params{CASE_CONV_U8S8_1, 2, 3},
bc_test_params{CASE_CONV_U8S8_2, 2, 3},
bc_test_params{CASE_CONV_U8S8_3, 2, 3},
bc_test_params{CASE_CONV_S8S8_1, 2, 3},
bc_test_params{CASE_CONV_S8S8_2, 2, 3},
bc_test_params{CASE_CONV_S8S8_3, 2, 3},
// cases with batch = 16
bc_test_params{CASE_CONV_U8S8_9, 2, 3},
bc_test_params{CASE_CONV_U8S8_10, 2, 3},
bc_test_params{CASE_CONV_S8S8_9, 2, 3},
bc_test_params{CASE_CONV_S8S8_10, 2, 3},
// cases with batch = 32
bc_test_params{CASE_CONV_U8S8_11, 2, 3},
bc_test_params{CASE_CONV_U8S8_12, 2, 3},
bc_test_params{CASE_CONV_U8S8_13, 2, 3},
bc_test_params{CASE_CONV_U8S8_14, 2, 3},
bc_test_params{CASE_CONV_S8S8_12, 2, 3},
bc_test_params{CASE_CONV_S8S8_13, 2, 3},
bc_test_params{CASE_CONV_S8S8_14, 2, 3},
bc_test_params{CASE_CONV_S8S8_15, 2, 3},
}));
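
// Editorial sketch of why eltwise_linear:1:-127 can vanish (the corresponding '+= beta' branch lies
// between the hunks shown above, so this is an assumption from the surrounding code): with alpha = 1
// and scale = 1 the op is a pure shift by beta, and a shift applied after binary_add(C) equals
// binary_add(C + beta), so the constant buffer is patched in place.
static_assert((3.f + 2.f) + (-127.f) == 3.f + (2.f + (-127.f)), "shift folds into the add constant");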
// Before optimization: binary_mul + eltw_linear
// After optimization: binary_mul
// Limitations: beta = 0 in eltw_linear; binary_mul is a constant compile-time buffer
// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:binary_mul:f32:2+eltwise_linear:2.01575+eltwise_clip:0:512
// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:binary_mul:f32:2+eltwise_clip:0:512
class post_ops_optimizations_onednn_binary_mul_eltw_linear : public ConvFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_binary_mul_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("eltwise_data", get_mem(get_per_channel_layout(p), -1, 1)),
data("in_lo", get_mem(get_per_channel_layout(p), 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), 0)),
data("out_hi", get_mem(get_single_element_layout(p), 512)),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
eltwise("eltwise", { "conv_prim", "eltwise_data" }, eltwise_mode::prod),
quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_mul_eltw_linear,
::testing::ValuesIn(std::vector<bc_test_params>{
// cases with batch = 1
bc_test_params{CASE_CONV_U8S8_1, 2, 4},
bc_test_params{CASE_CONV_U8S8_2, 2, 4},
bc_test_params{CASE_CONV_U8S8_3, 2, 4},
bc_test_params{CASE_CONV_S8S8_1, 2, 4},
bc_test_params{CASE_CONV_S8S8_2, 2, 4},
bc_test_params{CASE_CONV_S8S8_3, 2, 4},
// cases with batch = 16
bc_test_params{CASE_CONV_U8S8_9, 2, 4},
bc_test_params{CASE_CONV_U8S8_10, 2, 4},
bc_test_params{CASE_CONV_S8S8_9, 2, 4},
bc_test_params{CASE_CONV_S8S8_10, 2, 4},
// cases with batch = 32
bc_test_params{CASE_CONV_U8S8_11, 2, 4},
bc_test_params{CASE_CONV_U8S8_12, 2, 4},
bc_test_params{CASE_CONV_U8S8_13, 2, 4},
bc_test_params{CASE_CONV_S8S8_12, 2, 4},
bc_test_params{CASE_CONV_S8S8_13, 2, 4},
bc_test_params{CASE_CONV_S8S8_14, 2, 4},
}));
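
// Editorial sketch (not part of the commit): with beta = 0 the eltwise_linear is a pure scaling, and
// scaling after binary_mul(C) equals binary_mul(C * alpha * scale); this is the visible
// "bin_and_eltw_lock[data_idx] *= alpha * scale" patch of the constant buffer in the hunk above.
// 2.01575 is the requantization factor 512 / 254 (255 quantize levels span 254 steps).
static_assert(512.f / 254.f > 2.0157f && 512.f / 254.f < 2.0158f, "requantization factor ~2.01575");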
// Before optimization: o_scale + eltw_linear
// After optimization: o_scale
// Limitations: beta = 0 in eltw_linear
// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:eltwise_linear:2.01575+eltwise_clip:0:512
// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:eltwise_clip:0:512
class post_ops_optimizations_onednn_oscale_eltw_linear : public ConvFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_oscale_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), 0)),
data("out_hi", get_mem(get_single_element_layout(p), 512)),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_oscale_eltw_linear,
::testing::ValuesIn(std::vector<bc_test_params>{
// cases with batch = 1
bc_test_params{CASE_CONV_U8S8_1, 2, 3},
bc_test_params{CASE_CONV_U8S8_2, 2, 3},
bc_test_params{CASE_CONV_U8S8_3, 2, 3},
bc_test_params{CASE_CONV_S8S8_1, 2, 3},
bc_test_params{CASE_CONV_S8S8_2, 2, 3},
bc_test_params{CASE_CONV_S8S8_3, 2, 3},
// cases with batch = 16
bc_test_params{CASE_CONV_U8S8_9, 2, 3},
bc_test_params{CASE_CONV_U8S8_10, 2, 3},
bc_test_params{CASE_CONV_S8S8_9, 2, 3},
bc_test_params{CASE_CONV_S8S8_10, 2, 3},
// cases with batch = 32
bc_test_params{CASE_CONV_U8S8_11, 2, 3},
bc_test_params{CASE_CONV_U8S8_12, 2, 3},
bc_test_params{CASE_CONV_U8S8_13, 2, 3},
bc_test_params{CASE_CONV_S8S8_12, 2, 3},
bc_test_params{CASE_CONV_S8S8_13, 2, 3},
bc_test_params{CASE_CONV_S8S8_14, 2, 3},
}));
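
// Editorial check (not part of the commit): with beta = 0, alpha * (oscale[c] * x) equals
// (oscale[c] * alpha) * x, so each per-channel output-scale coefficient is multiplied in place
// ("eltw_and_scale_lock[data_idx] *= alpha * eltw_scale" in the hunk above) and the eltwise_linear
// post-op disappears from the chain.
static_assert(2.01575f * (2.f * 3.f) == (2.f * 2.01575f) * 3.f, "scale folds into oscale");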
// Before optimization: eltw_any + sum + eltw_linear
// After optimization: eltw_any + sum
// Limitations: beta = 0 in eltw_linear
// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_relu+sum:1:0:u8+eltwise_linear:12.7+eltwise_clip:0:127
// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_relu:0:0:12.7+sum:12.7:0:u8+eltwise_clip:0:127
class post_ops_optimizations_onednn_eltw_any_sum_eltw_linear : public ConvFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_eltw_any_sum_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_single_element_layout(p), 0)),
data("in_hi", get_mem(get_single_element_layout(p), 10)),
data("out_lo", get_mem(get_single_element_layout(p), 0)),
data("out_hi", get_mem(get_single_element_layout(p), 127)),
data("eltwise_data", get_mem(get_output_layout(p))),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
activation("activation", "conv_prim", activation_func::relu_negative_slope),
eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum),
quantize("quantize", "sum", "in_lo", "in_hi", "out_lo", "out_hi", 128, data_types::u8),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_any_sum_eltw_linear,
::testing::ValuesIn(std::vector<bc_test_params>{
// cases with batch = 1
bc_test_params{CASE_CONV_U8S8_1, 2, 5},
bc_test_params{CASE_CONV_U8S8_2, 2, 5},
bc_test_params{CASE_CONV_U8S8_3, 2, 5},
bc_test_params{CASE_CONV_S8S8_1, 2, 5},
bc_test_params{CASE_CONV_S8S8_2, 2, 5},
bc_test_params{CASE_CONV_S8S8_3, 2, 5},
// cases with batch = 16
bc_test_params{CASE_CONV_U8S8_10, 2, 5},
bc_test_params{CASE_CONV_S8S8_10, 2, 5},
// cases with batch = 32
bc_test_params{CASE_CONV_U8S8_11, 2, 5},
bc_test_params{CASE_CONV_U8S8_12, 2, 5},
bc_test_params{CASE_CONV_U8S8_13, 2, 5},
bc_test_params{CASE_CONV_U8S8_14, 2, 5},
bc_test_params{CASE_CONV_S8S8_12, 2, 5},
bc_test_params{CASE_CONV_S8S8_13, 2, 5},
bc_test_params{CASE_CONV_S8S8_14, 2, 5},
bc_test_params{CASE_CONV_S8S8_15, 2, 5},
}));
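
// Editorial check (not part of the commit) of the optimized log above, using the hunk's formulas
// (eltw_scale * next_alpha * next_scale for the relu scale, sum_scale * next_alpha for the sum scale):
constexpr float eltw_scale = 1.f, sum_scale = 1.f, next_alpha = 12.7f, next_scale = 1.f;
static_assert(eltw_scale * next_alpha * next_scale == 12.7f, "matches eltwise_relu:0:0:12.7");
static_assert(sum_scale * next_alpha == 12.7f, "matches sum:12.7:0:u8");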
// The input range is used in two cases: when the output range is not per-tensor, or when out_lo > out_hi
// Here out_lo > out_hi, so no optimization is applied
// DNNL_VERBOSE log: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round+eltwise_linear:-1:127
class post_ops_optimizations_input_range : public ConvFusingTestOneDNN {};
TEST_P(post_ops_optimizations_input_range, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_single_element_layout(p), -10)),
data("in_hi", get_mem(get_single_element_layout(p), 10)),
data("out_lo", get_mem(get_single_element_layout(p), 127)),
data("out_hi", get_mem(get_single_element_layout(p), -128)),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_input_range,
::testing::ValuesIn(std::vector<bc_test_params>{
// cases with batch = 1
bc_test_params{CASE_CONV_U8S8_1, 2, 3},
bc_test_params{CASE_CONV_U8S8_2, 2, 3},
bc_test_params{CASE_CONV_U8S8_3, 2, 3},
bc_test_params{CASE_CONV_S8S8_1, 2, 3},
bc_test_params{CASE_CONV_S8S8_2, 2, 3},
bc_test_params{CASE_CONV_S8S8_3, 2, 3},
// cases with batch = 16
bc_test_params{CASE_CONV_U8S8_9, 2, 3},
bc_test_params{CASE_CONV_U8S8_10, 2, 3},
bc_test_params{CASE_CONV_S8S8_9, 2, 3},
bc_test_params{CASE_CONV_S8S8_10, 2, 3},
// cases with batch = 32
bc_test_params{CASE_CONV_U8S8_11, 2, 3},
bc_test_params{CASE_CONV_U8S8_12, 2, 3},
bc_test_params{CASE_CONV_U8S8_13, 2, 3},
bc_test_params{CASE_CONV_U8S8_14, 2, 3},
bc_test_params{CASE_CONV_S8S8_12, 2, 3},
bc_test_params{CASE_CONV_S8S8_13, 2, 3},
bc_test_params{CASE_CONV_S8S8_14, 2, 3},
bc_test_params{CASE_CONV_S8S8_15, 2, 3},
}));
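
// Editorial check (not part of the commit): with out_lo = 127 > out_hi = -128 the output mapping is
// descending, so a quantize level l in [0, 255] maps to 127 - l, i.e. the log's final
// eltwise_linear:-1:127 (alpha = (out_hi - out_lo) / 255 = -1, beta = out_lo = 127); no pair here
// matches an optimization pattern, so the post-op chain is left as-is.
static_assert((-128.f - 127.f) / 255.f == -1.f, "alpha of eltwise_linear:-1:127");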
#endif