diff --git a/inference-engine/thirdparty/clDNN/src/impls/onednn/primitive_onednn_base.h b/inference-engine/thirdparty/clDNN/src/impls/onednn/primitive_onednn_base.h index bce27bc2cfd..10748a4c68b 100644 --- a/inference-engine/thirdparty/clDNN/src/impls/onednn/primitive_onednn_base.h +++ b/inference-engine/thirdparty/clDNN/src/impls/onednn/primitive_onednn_base.h @@ -161,6 +161,33 @@ protected: } }; + // Check that post-op type is any optimized + auto type_is_any_optimized = [](onednn_post_op_type type) -> bool { + return type == onednn_post_op_type::optimized || type == onednn_post_op_type::optimized_sum || + type == onednn_post_op_type::optimized_eltwise; + }; + + // Check that post-op type is eltwise + auto type_is_eltwise = [](onednn_post_op_type type) -> bool { + return type == onednn_post_op_type::eltwise_round || type == onednn_post_op_type::eltwise_linear || + type == onednn_post_op_type::eltwise_clip || type == onednn_post_op_type::eltwise_act; + }; + + // Check that post-op type is binary_add or binary_mul + auto type_is_binary_add_or_mul = [](onednn_post_op_type type) -> bool { + return type == onednn_post_op_type::binary_add || type == onednn_post_op_type::binary_mul; + }; + + // Simple post-op type checks + auto type_is_optimized = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::optimized; }; + auto type_is_eltwise_linear = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::eltwise_linear; }; + auto type_is_optimized_eltwise = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::optimized_eltwise; }; + auto type_is_binary_add = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::binary_add; }; + auto type_is_binary_mul = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::binary_mul; }; + auto type_is_sum = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::sum; }; + auto type_is_optimized_sum = 
[](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::optimized_sum; }; + auto type_is_scale = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::scale; }; + auto& cur_post_ops = onednn_fusing_map[node_id]; size_t cur_post_op_idx = 1; @@ -169,11 +196,11 @@ protected: // Check and update post-op map if we already optimized something for (size_t post_op_idx = 0; post_op_idx < cur_post_ops.size(); post_op_idx++) { - if (cur_post_ops[post_op_idx].op_type == onednn_post_op_type::optimized_sum) + if (type_is_optimized_sum(cur_post_ops[post_op_idx].op_type)) cur_post_ops[post_op_idx].op_type = onednn_post_op_type::sum; - else if (cur_post_ops[post_op_idx].op_type == onednn_post_op_type::optimized_eltwise) + else if (type_is_optimized_eltwise(cur_post_ops[post_op_idx].op_type)) cur_post_ops[post_op_idx].op_type = onednn_post_op_type::eltwise_linear; - else if (cur_post_ops[post_op_idx].op_type == onednn_post_op_type::optimized) + else if (type_is_optimized(cur_post_ops[post_op_idx].op_type)) cur_post_ops.erase(cur_post_ops.begin() + post_op_idx); } @@ -186,8 +213,7 @@ protected: auto prev_type = cur_post_ops[prev_post_op_idx].op_type; // Ignore optimized operations for "previous" operation in our operation pair - while ((prev_type == onednn_post_op_type::optimized || prev_type == onednn_post_op_type::optimized_sum || - prev_type == onednn_post_op_type::optimized_eltwise) && cur_post_op_idx < post_ops_size - 1) { + while (type_is_any_optimized(prev_type) && cur_post_op_idx < post_ops_size - 1) { prev_post_op_idx++; cur_post_op_idx++; prev_type = cur_post_ops[prev_post_op_idx].op_type; @@ -195,80 +221,83 @@ protected: } // Ignore optimized operations for "current" operation in our operation pair - while ((cur_type == onednn_post_op_type::optimized || cur_type == onednn_post_op_type::optimized_sum || - cur_type == onednn_post_op_type::optimized_eltwise) && cur_post_op_idx < post_ops_size - 1) { + while 
(type_is_any_optimized(cur_type) && cur_post_op_idx < post_ops_size - 1) { cur_post_op_idx++; cur_type = cur_post_ops[cur_post_op_idx].op_type; } auto cur_idx = static_cast(has_out_scales(attr) ? (cur_post_op_idx >= 1 ? cur_post_op_idx - 1 : 0) : cur_post_op_idx); auto prev_idx = static_cast(has_out_scales(attr) ? (prev_post_op_idx >= 1 ? prev_post_op_idx - 1 : 0) : prev_post_op_idx); - auto cur_type_is_optimized = cur_type == onednn_post_op_type::optimized || - cur_type == onednn_post_op_type::optimized_sum || - cur_type == onednn_post_op_type::optimized_eltwise; - auto prev_type_is_optimized = prev_type == onednn_post_op_type::optimized || - prev_type == onednn_post_op_type::optimized_sum || - prev_type == onednn_post_op_type::optimized_eltwise; // If this is the last pair and it's optimized - add the last post-op and go out from the cycle - if (cur_post_op_idx == post_ops_size - 1 && (cur_type_is_optimized || prev_type_is_optimized)) { - if (!prev_type_is_optimized) { + if (cur_post_op_idx == post_ops_size - 1 && (type_is_any_optimized(cur_type) || type_is_any_optimized(prev_type))) { + if (!type_is_any_optimized(prev_type)) { add_post_op(prev_type, p_ops, optimized_p_ops, prev_idx); } - if (!cur_type_is_optimized) { + if (!type_is_any_optimized(cur_type)) { add_post_op(cur_type, p_ops, optimized_p_ops, cur_idx); } break; } - auto equal_ops = cur_type == prev_type; - auto cur_type_is_binary_add_or_mul = cur_type == onednn_post_op_type::binary_add || cur_type == onednn_post_op_type::binary_mul; - auto prev_type_is_binary_add_or_mul = prev_type == onednn_post_op_type::binary_add || prev_type == onednn_post_op_type::binary_mul; - // Post-ops combinations which can be simplified - auto eltw_and_eltw = equal_ops && cur_type == onednn_post_op_type::eltwise_linear; - auto bin_and_eltw = cur_type_is_binary_add_or_mul && prev_type == onednn_post_op_type::eltwise_linear; - auto eltw_and_bin = cur_type == onednn_post_op_type::eltwise_linear && 
prev_type_is_binary_add_or_mul; - auto eltw_and_sum = cur_type == onednn_post_op_type::eltwise_linear && prev_type == onednn_post_op_type::sum; - auto eltw_and_scale = cur_type == onednn_post_op_type::eltwise_linear && prev_type == onednn_post_op_type::scale; + auto eltw_and_eltw = type_is_eltwise(cur_type) && type_is_eltwise(prev_type); + auto bin_and_eltw = type_is_binary_add_or_mul(cur_type) && type_is_eltwise_linear(prev_type); + auto eltw_and_bin = type_is_eltwise_linear(cur_type) && type_is_binary_add_or_mul(prev_type); + auto sum_and_eltw = type_is_sum(cur_type) && type_is_eltwise(prev_type); + auto eltw_and_scale = type_is_eltwise_linear(cur_type) && type_is_scale(prev_type); auto can_try_optimize = eltw_and_eltw || bin_and_eltw || eltw_and_bin || - eltw_and_sum || + sum_and_eltw || eltw_and_scale; bool cur_ops_pair_is_optimized = false; if (can_try_optimize) { if (eltw_and_eltw) { - dnnl::algorithm alg; + dnnl::algorithm cur_alg, prev_alg; float cur_scale, prev_scale, cur_alpha, prev_alpha, cur_beta, prev_beta; - p_ops.get_params_eltwise(prev_idx, prev_scale, alg, prev_alpha, prev_beta); - p_ops.get_params_eltwise(cur_idx, cur_scale, alg, cur_alpha, cur_beta); + p_ops.get_params_eltwise(prev_idx, prev_scale, prev_alg, prev_alpha, prev_beta); + p_ops.get_params_eltwise(cur_idx, cur_scale, cur_alg, cur_alpha, cur_beta); - // Eltwise + eltwise pair can be optimized only if cur_alpha is equal to 1.0f - if (cur_alpha == 1.0f && prev_scale == cur_scale) { + auto eltw_linear_and_eltw_linear = type_is_eltwise_linear(cur_type) && type_is_eltwise_linear(prev_type); + auto eltw_linear_and_eltw_non_linear = type_is_eltwise_linear(cur_type) && !type_is_eltwise_linear(prev_type) && cur_beta == 0; + + // eltwise_linear + eltwise_linear combination can be optimized always + if (eltw_linear_and_eltw_linear) { dnnl::post_ops eltw_p_op; - eltw_p_op.append_eltwise(cur_scale, alg, prev_alpha, cur_beta + prev_beta); + float optimized_alpha = cur_alpha * prev_alpha * prev_scale; 
+ float optimized_beta = cur_alpha * prev_beta * prev_scale + cur_beta; + float optimized_scale = cur_scale; + eltw_p_op.append_eltwise(optimized_scale, cur_alg, optimized_alpha, optimized_beta); // Combine 2 eltwises into one add_post_op(cur_type, eltw_p_op, optimized_p_ops, 0); + } else if (eltw_linear_and_eltw_non_linear) { + dnnl::post_ops eltw_p_op; + eltw_p_op.append_eltwise(cur_scale * prev_scale * cur_alpha, prev_alg, prev_alpha, prev_beta); + // Combine 2 eltwises into one + add_post_op(prev_type, eltw_p_op, optimized_p_ops, 0); + } + + if (eltw_linear_and_eltw_linear || eltw_linear_and_eltw_non_linear) { // Marked current and previous eltwise operations as 'optimized' (they will be ignored on the next iteration of cycle) cur_post_ops[cur_post_op_idx].op_type = onednn_post_op_type::optimized; cur_post_ops[prev_post_op_idx].op_type = onednn_post_op_type::optimized_eltwise; // Set the flag if extra optimizations checking is needed if (cur_post_op_idx < post_ops_size - 1) { - if (cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::eltwise_linear || - cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::binary_add || - cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::binary_mul || - cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::optimized_eltwise) { + if (type_is_eltwise_linear(cur_post_ops[cur_post_op_idx + 1].op_type) || + type_is_binary_add_or_mul(cur_post_ops[cur_post_op_idx + 1].op_type) || + type_is_optimized_eltwise(cur_post_ops[cur_post_op_idx + 1].op_type)) { optimization_is_completed = true; } } + cur_ops_pair_is_optimized = true; } } else if (bin_and_eltw) { @@ -285,11 +314,13 @@ protected: auto bin_ops_can_be_optimized = cur_node.is_type() && cur_node.is_constant() && cur_node.get_users().size() == 1 && desc.data_type() == dnnl_f32; - auto bin_add_and_eltw = alpha == 1.0f && scale == 1.0f && cur_type == onednn_post_op_type::binary_add && bin_ops_can_be_optimized; - auto bin_mul_and_eltw 
= beta == 0.f && scale == 1.0f && cur_type == onednn_post_op_type::binary_mul && bin_ops_can_be_optimized; + auto bin_add_and_eltw = alpha == 1.0f && scale == 1.0f && type_is_binary_add(cur_type) && bin_ops_can_be_optimized; + auto bin_mul_and_eltw = beta == 0.f && type_is_binary_mul(cur_type) && bin_ops_can_be_optimized; if (bin_add_and_eltw || bin_mul_and_eltw) { memory::ptr cur_bin_mem_ptr = cur_node.as().get_attached_memory_ptr(); + if (cur_bin_mem_ptr == nullptr) + throw std::runtime_error("OneDNN post-ops optimization error: nonexistent node for bin + eltw"); auto& stream = cur_bin_mem_ptr->get_engine()->get_program_stream(); mem_lock bin_and_eltw_lock(cur_bin_mem_ptr, stream); @@ -302,7 +333,7 @@ protected: } } else { for (size_t data_idx = 0; data_idx < cur_bin_mem_size; data_idx++) { - bin_and_eltw_lock[data_idx] *= alpha; + bin_and_eltw_lock[data_idx] *= alpha * scale; } } @@ -325,11 +356,13 @@ protected: auto bin_ops_can_be_optimized = prev_node.is_type() && prev_node.is_constant() && prev_node.get_users().size() == 1 && desc.data_type() == dnnl_f32; - auto eltw_and_bin_add = alpha == 1.0f && scale == 1.0f && prev_type == onednn_post_op_type::binary_add && bin_ops_can_be_optimized; - auto eltw_and_bin_mul = beta == 0.f && scale == 1.0f && prev_type == onednn_post_op_type::binary_mul && bin_ops_can_be_optimized; + auto eltw_and_bin_add = alpha == 1.0f && scale == 1.0f && type_is_binary_add(prev_type) && bin_ops_can_be_optimized; + auto eltw_and_bin_mul = beta == 0.f && type_is_binary_mul(prev_type) && bin_ops_can_be_optimized; if (eltw_and_bin_add || eltw_and_bin_mul) { memory::ptr prev_bin_mem_ptr = prev_node.as().get_attached_memory_ptr(); + if (prev_bin_mem_ptr == nullptr) + throw std::runtime_error("OneDNN post-ops optimization error: nonexistent node for eltw + bin"); auto& stream = prev_bin_mem_ptr->get_engine()->get_program_stream(); mem_lock eltw_and_bin_lock(prev_bin_mem_ptr, stream); @@ -342,7 +375,7 @@ protected: } } else { for (size_t data_idx 
= 0; data_idx < prev_bin_mem_size; data_idx++) { - eltw_and_bin_lock[data_idx] *= alpha; + eltw_and_bin_lock[data_idx] *= alpha * scale; } } @@ -351,63 +384,69 @@ protected: cur_ops_pair_is_optimized = true; } - } else if (eltw_and_sum) { + } else if (sum_and_eltw) { dnnl::algorithm alg; - float cur_scale, prev_scale, alpha, beta; + float sum_scale, eltw_scale, alpha, beta; dnnl::memory::data_type data_type; - cldnn::program_node& prev_node = arg.get_dependency(cur_post_ops[prev_post_op_idx].mem_dep); + dnnl::algorithm next_alg; + float next_scale, next_alpha, next_beta; + size_t next_idx = cur_idx + 1; + size_t next_post_op_idx = cur_post_op_idx + 1; - p_ops.get_params_eltwise(cur_idx, cur_scale, alg, alpha, beta); - p_ops.get_params_sum(prev_idx, prev_scale, data_type); + bool can_optimize_eltw_and_sum = false; - // Eltwise operations can use runtime non-constant data buffers, so check that memory buffers consist of constant data only - auto eltw_ops_can_be_optimized = prev_node.is_type() && prev_node.is_constant() && - prev_node.get_users().size() == 1; + if (cur_post_op_idx < post_ops_size - 1) { + auto next_type = cur_post_ops[next_post_op_idx].op_type; + if (type_is_eltwise_linear(next_type)) { + p_ops.get_params_eltwise(next_idx, next_scale, next_alg, next_alpha, next_beta); - // Eltwise can be inserted into the scale field of previous sum if cur_beta is equal to 0.f - if (beta == 0.f && cur_scale == 1.0f && eltw_ops_can_be_optimized) { - dnnl::post_ops sum_p_op; - sum_p_op.append_sum(alpha * prev_scale, data_type); - - // Insert cur eltwise into sum - add_post_op(prev_type, sum_p_op, optimized_p_ops, 0); - - memory::ptr prev_eltw_mem_ptr = prev_node.as().get_attached_memory_ptr(); - auto& stream = prev_eltw_mem_ptr->get_engine()->get_program_stream(); - mem_lock eltw_and_sum_lock(prev_eltw_mem_ptr, stream); - - size_t prev_eltw_mem_size = prev_node.get_output_layout().count(); - - // Also multiply sum on alpha for getting valid results - for (size_t 
data_idx = 0; data_idx < prev_eltw_mem_size; data_idx++) { - eltw_and_sum_lock[data_idx] *= alpha; + if (next_beta == 0) + can_optimize_eltw_and_sum = true; } + } - // Marked current and previous operations as 'optimized' (they will be ignored on the next iteration of cycle) - cur_post_ops[cur_post_op_idx].op_type = onednn_post_op_type::optimized; - cur_post_ops[prev_post_op_idx].op_type = onednn_post_op_type::optimized_sum; + // Try to optimize eltwise (any) + sum + eltwise_linear (with beta = 0) chain of operations + if (can_optimize_eltw_and_sum) { + p_ops.get_params_sum(cur_idx, sum_scale, data_type); + p_ops.get_params_eltwise(prev_idx, eltw_scale, alg, alpha, beta); + + dnnl::post_ops eltw_p_op_prev, sum_p_op; + + eltw_p_op_prev.append_eltwise(eltw_scale * next_alpha * next_scale, alg, alpha, beta); + sum_p_op.append_sum(sum_scale * next_alpha, data_type); + + add_post_op(prev_type, eltw_p_op_prev, optimized_p_ops, 0); + add_post_op(cur_type, sum_p_op, optimized_p_ops, 0); + + // Marked current, previous and next operations as 'optimized' (they will be ignored on the next iteration of cycle) + cur_post_ops[prev_post_op_idx].op_type = onednn_post_op_type::optimized_eltwise; + cur_post_ops[cur_post_op_idx].op_type = onednn_post_op_type::optimized_sum; + cur_post_ops[next_post_op_idx].op_type = onednn_post_op_type::optimized; // Set the flag if extra optimizations checking is needed - if (cur_post_op_idx < post_ops_size - 1) { - if (cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::eltwise_linear || - cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::optimized_eltwise) { + if (next_post_op_idx < post_ops_size - 1) { + if (type_is_eltwise_linear(cur_post_ops[next_post_op_idx + 1].op_type) || + type_is_optimized_eltwise(cur_post_ops[next_post_op_idx + 1].op_type)) { optimization_is_completed = true; } } + cur_ops_pair_is_optimized = true; } } else if (eltw_and_scale) { dnnl::algorithm alg; - float cur_scale, alpha, beta; + float 
eltw_scale, alpha, beta; cldnn::program_node& prev_node = arg.get_dependency(cur_post_ops[prev_post_op_idx].mem_dep); - p_ops.get_params_eltwise(cur_idx, cur_scale, alg, alpha, beta); + p_ops.get_params_eltwise(cur_idx, eltw_scale, alg, alpha, beta); - // Eltwise can be inserted into output_scale if cur_beta is equal to 0.f and cur_scale is equal to 1.0f - if (beta == 0.f && cur_scale == 1.0f && prev_node.get_output_layout().data_type == data_types::f32) { + // Eltwise can be inserted into the output_scale if cur_beta is equal to 0.f + if (beta == 0.f && prev_node.get_output_layout().data_type == data_types::f32) { memory::ptr prev_scale_mem_ptr = prev_node.as().get_attached_memory_ptr(); + if (prev_scale_mem_ptr == nullptr) + throw std::runtime_error("OneDNN post-ops optimization error: nonexistent node for eltw + scale"); auto& stream = prev_scale_mem_ptr->get_engine()->get_program_stream(); mem_lock eltw_and_scale_lock(prev_scale_mem_ptr, stream); @@ -415,7 +454,7 @@ protected: // Update all scale coefficients for (size_t data_idx = 0; data_idx < prev_scale_mem_size; data_idx++) { - eltw_and_scale_lock[data_idx] *= alpha; + eltw_and_scale_lock[data_idx] *= alpha * eltw_scale; } // Marked current eltwise operation as 'optimized' (it will be ignored on the next iteration of cycle) diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp index 0089df16947..888c1cfe161 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp @@ -9433,6 +9433,364 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_shift_swish_onednn, bc_test_params{CASE_CONV_S8S8_13, 2, 7}, bc_test_params{CASE_CONV_S8S8_15, 2, 7}, })); + +/* ----------------------------------------------------------------------------------------------------- */ +/* ------------------------------ OneDNN 
post-ops cases with optimizations ----------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ + +// Before optimization: eltw_linear + eltw_linear +// After optimization: eltw_linear +// Limitations: no +// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_linear:1:-128 +// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_linear:12.75:-0.5 +class post_ops_optimizations_onednn_eltw_linear_eltw_linear : public ConvFusingTestOneDNN {}; +TEST_P(post_ops_optimizations_onednn_eltw_linear_eltw_linear, basic) { + auto p = GetParam(); + create_topologies(input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_single_element_layout(p), -10)), + data("in_hi", get_mem(get_single_element_layout(p), 10)), + data("out_lo", get_mem(get_single_element_layout(p), -128)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_linear_eltw_linear, + ::testing::ValuesIn(std::vector{ + // cases with batch = 1 + bc_test_params{CASE_CONV_U8S8_1, 2, 3}, + bc_test_params{CASE_CONV_U8S8_2, 2, 3}, + bc_test_params{CASE_CONV_U8S8_3, 2, 3}, + bc_test_params{CASE_CONV_S8S8_1, 2, 3}, + bc_test_params{CASE_CONV_S8S8_2, 2, 3}, + bc_test_params{CASE_CONV_S8S8_3, 2, 3}, + + // cases with batch = 16 + bc_test_params{CASE_CONV_U8S8_9, 2, 3}, + bc_test_params{CASE_CONV_U8S8_10, 2, 3}, + bc_test_params{CASE_CONV_S8S8_9, 2, 3}, + bc_test_params{CASE_CONV_S8S8_10, 2, 3}, + + // 
cases with batch = 32 + bc_test_params{CASE_CONV_U8S8_11, 2, 3}, + bc_test_params{CASE_CONV_U8S8_12, 2, 3}, + bc_test_params{CASE_CONV_U8S8_13, 2, 3}, + bc_test_params{CASE_CONV_U8S8_14, 2, 3}, + bc_test_params{CASE_CONV_S8S8_12, 2, 3}, + bc_test_params{CASE_CONV_S8S8_13, 2, 3}, + bc_test_params{CASE_CONV_S8S8_14, 2, 3}, + bc_test_params{CASE_CONV_S8S8_15, 2, 3}, + })); + +// Before optimization: eltw_non_linear + eltw_linear +// After optimization: eltw_non_linear +// Limitations: beta = 0 in eltw_linear +// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round+eltwise_linear:2.00784+eltwise_clip:0:512 +// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round:0:0:2.00784+eltwise_clip:0:512 +class post_ops_optimizations_onednn_eltw_non_linear_eltw_linear : public ConvFusingTestOneDNN {}; +TEST_P(post_ops_optimizations_onednn_eltw_non_linear_eltw_linear, basic) { + auto p = GetParam(); + create_topologies(input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_single_element_layout(p), -10)), + data("in_hi", get_mem(get_single_element_layout(p), 10)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 512)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::f32), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_non_linear_eltw_linear, + ::testing::ValuesIn(std::vector{ + // cases with batch = 1 + bc_test_params{CASE_CONV_U8S8_1, 2, 3}, + bc_test_params{CASE_CONV_U8S8_2, 2, 3}, + bc_test_params{CASE_CONV_U8S8_3, 2, 3}, + 
bc_test_params{CASE_CONV_S8S8_1, 2, 3}, + bc_test_params{CASE_CONV_S8S8_2, 2, 3}, + bc_test_params{CASE_CONV_S8S8_3, 2, 3}, + + // cases with batch = 16 + bc_test_params{CASE_CONV_U8S8_9, 2, 3}, + bc_test_params{CASE_CONV_U8S8_10, 2, 3}, + bc_test_params{CASE_CONV_S8S8_9, 2, 3}, + bc_test_params{CASE_CONV_S8S8_10, 2, 3}, + + // cases with batch = 32 + bc_test_params{CASE_CONV_U8S8_11, 2, 3}, + bc_test_params{CASE_CONV_U8S8_12, 2, 3}, + bc_test_params{CASE_CONV_U8S8_13, 2, 3}, + bc_test_params{CASE_CONV_U8S8_14, 2, 3}, + bc_test_params{CASE_CONV_S8S8_12, 2, 3}, + bc_test_params{CASE_CONV_S8S8_13, 2, 3}, + bc_test_params{CASE_CONV_S8S8_14, 2, 3}, + bc_test_params{CASE_CONV_S8S8_15, 2, 3}, + })); + +// Before optimization: binary_add + eltw_linear +// After optimization: binary_add +// Limitations: alpha = 1 and scale = 1 in eltw_linear; binary_add is a constant compile-time buffer +// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:binary_add:f32:2+eltwise_linear:1:-127+eltwise_clip:-127:127 +// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:binary_add:f32:2+eltwise_clip:-127:127 +class post_ops_optimizations_onednn_binary_add_eltw_linear : public ConvFusingTestOneDNN {}; +TEST_P(post_ops_optimizations_onednn_binary_add_eltw_linear, basic) { + auto p = GetParam(); + create_topologies(input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, 
data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_add_eltw_linear, + ::testing::ValuesIn(std::vector{ + // cases with batch = 1 + bc_test_params{CASE_CONV_U8S8_1, 2, 3}, + bc_test_params{CASE_CONV_U8S8_2, 2, 3}, + bc_test_params{CASE_CONV_U8S8_3, 2, 3}, + bc_test_params{CASE_CONV_S8S8_1, 2, 3}, + bc_test_params{CASE_CONV_S8S8_2, 2, 3}, + bc_test_params{CASE_CONV_S8S8_3, 2, 3}, + + // cases with batch = 16 + bc_test_params{CASE_CONV_U8S8_9, 2, 3}, + bc_test_params{CASE_CONV_U8S8_10, 2, 3}, + bc_test_params{CASE_CONV_S8S8_9, 2, 3}, + bc_test_params{CASE_CONV_S8S8_10, 2, 3}, + + // cases with batch = 32 + bc_test_params{CASE_CONV_U8S8_11, 2, 3}, + bc_test_params{CASE_CONV_U8S8_12, 2, 3}, + bc_test_params{CASE_CONV_U8S8_13, 2, 3}, + bc_test_params{CASE_CONV_U8S8_14, 2, 3}, + bc_test_params{CASE_CONV_S8S8_12, 2, 3}, + bc_test_params{CASE_CONV_S8S8_13, 2, 3}, + bc_test_params{CASE_CONV_S8S8_14, 2, 3}, + bc_test_params{CASE_CONV_S8S8_15, 2, 3}, + })); + +// Before optimization: binary_mul + eltw_linear +// After optimization: binary_mul +// Limitations: beta = 0 in eltw_linear; binary_mul is a constant compile-time buffer +// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:binary_mul:f32:2+eltwise_linear:2.01575+eltwise_clip:0:512 +// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:binary_mul:f32:2+eltwise_clip:0:512 +class post_ops_optimizations_onednn_binary_mul_eltw_linear : public ConvFusingTestOneDNN {}; +TEST_P(post_ops_optimizations_onednn_binary_mul_eltw_linear, basic) { + auto p = GetParam(); + create_topologies(input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("eltwise_data", get_mem(get_per_channel_layout(p), -1, 1)), + data("in_lo", get_mem(get_per_channel_layout(p), 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + 
data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 512)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + eltwise("eltwise", { "conv_prim", "eltwise_data" }, eltwise_mode::prod), + quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_mul_eltw_linear, + ::testing::ValuesIn(std::vector{ + // cases with batch = 1 + bc_test_params{CASE_CONV_U8S8_1, 2, 4}, + bc_test_params{CASE_CONV_U8S8_2, 2, 4}, + bc_test_params{CASE_CONV_U8S8_3, 2, 4}, + bc_test_params{CASE_CONV_S8S8_1, 2, 4}, + bc_test_params{CASE_CONV_S8S8_2, 2, 4}, + bc_test_params{CASE_CONV_S8S8_3, 2, 4}, + + // cases with batch = 16 + bc_test_params{CASE_CONV_U8S8_9, 2, 4}, + bc_test_params{CASE_CONV_U8S8_10, 2, 4}, + bc_test_params{CASE_CONV_S8S8_9, 2, 4}, + bc_test_params{CASE_CONV_S8S8_10, 2, 4}, + + // cases with batch = 32 + bc_test_params{CASE_CONV_U8S8_11, 2, 4}, + bc_test_params{CASE_CONV_U8S8_12, 2, 4}, + bc_test_params{CASE_CONV_U8S8_13, 2, 4}, + bc_test_params{CASE_CONV_S8S8_12, 2, 4}, + bc_test_params{CASE_CONV_S8S8_13, 2, 4}, + bc_test_params{CASE_CONV_S8S8_14, 2, 4}, + })); + +// Before optimization: o_scale + eltw_linear +// After optimization: o_scale +// Limitations: beta = 0 in eltw_linear +// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:eltwise_linear:2.01575+eltwise_clip:0:512 +// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:eltwise_clip:0:512 +class post_ops_optimizations_onednn_oscale_eltw_linear : public ConvFusingTestOneDNN {}; +TEST_P(post_ops_optimizations_onednn_oscale_eltw_linear, basic) { + auto p = GetParam(); + create_topologies(input_layout("input", get_input_layout(p)), + data("weights", 
get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 512)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_oscale_eltw_linear, + ::testing::ValuesIn(std::vector{ + // cases with batch = 1 + bc_test_params{CASE_CONV_U8S8_1, 2, 3}, + bc_test_params{CASE_CONV_U8S8_2, 2, 3}, + bc_test_params{CASE_CONV_U8S8_3, 2, 3}, + bc_test_params{CASE_CONV_S8S8_1, 2, 3}, + bc_test_params{CASE_CONV_S8S8_2, 2, 3}, + bc_test_params{CASE_CONV_S8S8_3, 2, 3}, + + // cases with batch = 16 + bc_test_params{CASE_CONV_U8S8_9, 2, 3}, + bc_test_params{CASE_CONV_U8S8_10, 2, 3}, + bc_test_params{CASE_CONV_S8S8_9, 2, 3}, + bc_test_params{CASE_CONV_S8S8_10, 2, 3}, + + // cases with batch = 32 + bc_test_params{CASE_CONV_U8S8_11, 2, 3}, + bc_test_params{CASE_CONV_U8S8_12, 2, 3}, + bc_test_params{CASE_CONV_U8S8_13, 2, 3}, + bc_test_params{CASE_CONV_S8S8_12, 2, 3}, + bc_test_params{CASE_CONV_S8S8_13, 2, 3}, + bc_test_params{CASE_CONV_S8S8_14, 2, 3}, + })); + +// Before optimization: eltw_any + sum + eltw_linear +// After optimization: eltw_any + sum +// Limitations: beta = 0 in eltw_linear +// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_relu+sum:1:0:u8+eltwise_linear:12.7+eltwise_clip:0:127 +// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_relu:0:0:12.7+sum:12.7:0:u8+eltwise_clip:0:127 +class post_ops_optimizations_onednn_eltw_any_sum_eltw_linear : public ConvFusingTestOneDNN {}; 
+TEST_P(post_ops_optimizations_onednn_eltw_any_sum_eltw_linear, basic) { + auto p = GetParam(); + create_topologies(input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_single_element_layout(p), 0)), + data("in_hi", get_mem(get_single_element_layout(p), 10)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("eltwise_data", get_mem(get_output_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", activation_func::relu_negative_slope), + eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum), + quantize("quantize", "sum", "in_lo", "in_hi", "out_lo", "out_hi", 128, data_types::u8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_any_sum_eltw_linear, + ::testing::ValuesIn(std::vector{ + // cases with batch = 1 + bc_test_params{CASE_CONV_U8S8_1, 2, 5}, + bc_test_params{CASE_CONV_U8S8_2, 2, 5}, + bc_test_params{CASE_CONV_U8S8_3, 2, 5}, + bc_test_params{CASE_CONV_S8S8_1, 2, 5}, + bc_test_params{CASE_CONV_S8S8_2, 2, 5}, + bc_test_params{CASE_CONV_S8S8_3, 2, 5}, + + // cases with batch = 16 + bc_test_params{CASE_CONV_U8S8_10, 2, 5}, + bc_test_params{CASE_CONV_S8S8_10, 2, 5}, + + // cases with batch = 32 + bc_test_params{CASE_CONV_U8S8_11, 2, 5}, + bc_test_params{CASE_CONV_U8S8_12, 2, 5}, + bc_test_params{CASE_CONV_U8S8_13, 2, 5}, + bc_test_params{CASE_CONV_U8S8_14, 2, 5}, + bc_test_params{CASE_CONV_S8S8_12, 2, 5}, + bc_test_params{CASE_CONV_S8S8_13, 2, 5}, + bc_test_params{CASE_CONV_S8S8_14, 2, 5}, + bc_test_params{CASE_CONV_S8S8_15, 2, 5}, + })); + +// The input range is used in 2 cases: a non-per-tensor output range, or out_lo > out_hi
+// Here out_lo > out_hi and no optimizations are applied +// DNNL_VERBOSE log: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round+eltwise_linear:-1:127 +class post_ops_optimizations_input_range : public ConvFusingTestOneDNN {}; +TEST_P(post_ops_optimizations_input_range, basic) { + auto p = GetParam(); + create_topologies(input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_single_element_layout(p), -10)), + data("in_hi", get_mem(get_single_element_layout(p), 10)), + data("out_lo", get_mem(get_single_element_layout(p), 127)), + data("out_hi", get_mem(get_single_element_layout(p), -128)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_input_range, + ::testing::ValuesIn(std::vector{ + // cases with batch = 1 + bc_test_params{CASE_CONV_U8S8_1, 2, 3}, + bc_test_params{CASE_CONV_U8S8_2, 2, 3}, + bc_test_params{CASE_CONV_U8S8_3, 2, 3}, + bc_test_params{CASE_CONV_S8S8_1, 2, 3}, + bc_test_params{CASE_CONV_S8S8_2, 2, 3}, + bc_test_params{CASE_CONV_S8S8_3, 2, 3}, + + // cases with batch = 16 + bc_test_params{CASE_CONV_U8S8_9, 2, 3}, + bc_test_params{CASE_CONV_U8S8_10, 2, 3}, + bc_test_params{CASE_CONV_S8S8_9, 2, 3}, + bc_test_params{CASE_CONV_S8S8_10, 2, 3}, + + // cases with batch = 32 + bc_test_params{CASE_CONV_U8S8_11, 2, 3}, + bc_test_params{CASE_CONV_U8S8_12, 2, 3}, + bc_test_params{CASE_CONV_U8S8_13, 2, 3}, + bc_test_params{CASE_CONV_U8S8_14, 2, 3}, + bc_test_params{CASE_CONV_S8S8_12, 2, 3}, + bc_test_params{CASE_CONV_S8S8_13, 2, 3}, + bc_test_params{CASE_CONV_S8S8_14, 2, 3}, + bc_test_params{CASE_CONV_S8S8_15, 2, 3}, + })); #endif