[GPU] New OneDNN post-ops optimizations and fusing tests (#8056)

* [GPU] New OneDNN post-ops optimizations and fusing tests

* [GPU] Code refactoring based on GitHub review comments

Failing test disabled in #8109
Ilya Znamenskiy 2021-10-28 09:30:08 +03:00 committed by GitHub
parent 6f862822e1
commit 262c87c6e1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 475 additions and 78 deletions

@@ -161,6 +161,33 @@ protected:
     }
 };

+// Check that post-op type is any optimized
+auto type_is_any_optimized = [](onednn_post_op_type type) -> bool {
+    return type == onednn_post_op_type::optimized || type == onednn_post_op_type::optimized_sum ||
+           type == onednn_post_op_type::optimized_eltwise;
+};
+
+// Check that post-op type is eltwise
+auto type_is_eltwise = [](onednn_post_op_type type) -> bool {
+    return type == onednn_post_op_type::eltwise_round || type == onednn_post_op_type::eltwise_linear ||
+           type == onednn_post_op_type::eltwise_clip || type == onednn_post_op_type::eltwise_act;
+};
+
+// Check that post-op type is binary_add or binary_mul
+auto type_is_binary_add_or_mul = [](onednn_post_op_type type) -> bool {
+    return type == onednn_post_op_type::binary_add || type == onednn_post_op_type::binary_mul;
+};
+
+// Simple post-op type checks
+auto type_is_optimized = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::optimized; };
+auto type_is_eltwise_linear = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::eltwise_linear; };
+auto type_is_optimized_eltwise = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::optimized_eltwise; };
+auto type_is_binary_add = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::binary_add; };
+auto type_is_binary_mul = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::binary_mul; };
+auto type_is_sum = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::sum; };
+auto type_is_optimized_sum = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::optimized_sum; };
+auto type_is_scale = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::scale; };
+
 auto& cur_post_ops = onednn_fusing_map[node_id];
 size_t cur_post_op_idx = 1;
@@ -169,11 +196,11 @@ protected:
 // Check and update post-op map if we already optimized something
 for (size_t post_op_idx = 0; post_op_idx < cur_post_ops.size(); post_op_idx++) {
-    if (cur_post_ops[post_op_idx].op_type == onednn_post_op_type::optimized_sum)
+    if (type_is_optimized_sum(cur_post_ops[post_op_idx].op_type))
         cur_post_ops[post_op_idx].op_type = onednn_post_op_type::sum;
-    else if (cur_post_ops[post_op_idx].op_type == onednn_post_op_type::optimized_eltwise)
+    else if (type_is_optimized_eltwise(cur_post_ops[post_op_idx].op_type))
         cur_post_ops[post_op_idx].op_type = onednn_post_op_type::eltwise_linear;
-    else if (cur_post_ops[post_op_idx].op_type == onednn_post_op_type::optimized)
+    else if (type_is_optimized(cur_post_ops[post_op_idx].op_type))
         cur_post_ops.erase(cur_post_ops.begin() + post_op_idx);
 }
@@ -186,8 +213,7 @@ protected:
 auto prev_type = cur_post_ops[prev_post_op_idx].op_type;

 // Ignore optimized operations for "previous" operation in our operation pair
-while ((prev_type == onednn_post_op_type::optimized || prev_type == onednn_post_op_type::optimized_sum ||
-        prev_type == onednn_post_op_type::optimized_eltwise) && cur_post_op_idx < post_ops_size - 1) {
+while (type_is_any_optimized(prev_type) && cur_post_op_idx < post_ops_size - 1) {
     prev_post_op_idx++;
     cur_post_op_idx++;
     prev_type = cur_post_ops[prev_post_op_idx].op_type;
@@ -195,80 +221,83 @@ protected:
 }

 // Ignore optimized operations for "current" operation in our operation pair
-while ((cur_type == onednn_post_op_type::optimized || cur_type == onednn_post_op_type::optimized_sum ||
-        cur_type == onednn_post_op_type::optimized_eltwise) && cur_post_op_idx < post_ops_size - 1) {
+while (type_is_any_optimized(cur_type) && cur_post_op_idx < post_ops_size - 1) {
     cur_post_op_idx++;
     cur_type = cur_post_ops[cur_post_op_idx].op_type;
 }

 auto cur_idx = static_cast<int>(has_out_scales(attr) ? (cur_post_op_idx >= 1 ? cur_post_op_idx - 1 : 0) : cur_post_op_idx);
 auto prev_idx = static_cast<int>(has_out_scales(attr) ? (prev_post_op_idx >= 1 ? prev_post_op_idx - 1 : 0) : prev_post_op_idx);

-auto cur_type_is_optimized = cur_type == onednn_post_op_type::optimized ||
-                             cur_type == onednn_post_op_type::optimized_sum ||
-                             cur_type == onednn_post_op_type::optimized_eltwise;
-auto prev_type_is_optimized = prev_type == onednn_post_op_type::optimized ||
-                              prev_type == onednn_post_op_type::optimized_sum ||
-                              prev_type == onednn_post_op_type::optimized_eltwise;
-
 // If this is the last pair and it's optimized - add the last post-op and go out from the cycle
-if (cur_post_op_idx == post_ops_size - 1 && (cur_type_is_optimized || prev_type_is_optimized)) {
-    if (!prev_type_is_optimized) {
+if (cur_post_op_idx == post_ops_size - 1 && (type_is_any_optimized(cur_type) || type_is_any_optimized(prev_type))) {
+    if (!type_is_any_optimized(prev_type)) {
         add_post_op(prev_type, p_ops, optimized_p_ops, prev_idx);
     }
-    if (!cur_type_is_optimized) {
+    if (!type_is_any_optimized(cur_type)) {
         add_post_op(cur_type, p_ops, optimized_p_ops, cur_idx);
     }
     break;
 }

-auto equal_ops = cur_type == prev_type;
-auto cur_type_is_binary_add_or_mul = cur_type == onednn_post_op_type::binary_add || cur_type == onednn_post_op_type::binary_mul;
-auto prev_type_is_binary_add_or_mul = prev_type == onednn_post_op_type::binary_add || prev_type == onednn_post_op_type::binary_mul;
-
 // Post-ops combinations which can be simplified
-auto eltw_and_eltw = equal_ops && cur_type == onednn_post_op_type::eltwise_linear;
-auto bin_and_eltw = cur_type_is_binary_add_or_mul && prev_type == onednn_post_op_type::eltwise_linear;
-auto eltw_and_bin = cur_type == onednn_post_op_type::eltwise_linear && prev_type_is_binary_add_or_mul;
-auto eltw_and_sum = cur_type == onednn_post_op_type::eltwise_linear && prev_type == onednn_post_op_type::sum;
-auto eltw_and_scale = cur_type == onednn_post_op_type::eltwise_linear && prev_type == onednn_post_op_type::scale;
+auto eltw_and_eltw = type_is_eltwise(cur_type) && type_is_eltwise(prev_type);
+auto bin_and_eltw = type_is_binary_add_or_mul(cur_type) && type_is_eltwise_linear(prev_type);
+auto eltw_and_bin = type_is_eltwise_linear(cur_type) && type_is_binary_add_or_mul(prev_type);
+auto sum_and_eltw = type_is_sum(cur_type) && type_is_eltwise(prev_type);
+auto eltw_and_scale = type_is_eltwise_linear(cur_type) && type_is_scale(prev_type);

 auto can_try_optimize = eltw_and_eltw ||
                         bin_and_eltw ||
                         eltw_and_bin ||
-                        eltw_and_sum ||
+                        sum_and_eltw ||
                         eltw_and_scale;

 bool cur_ops_pair_is_optimized = false;

 if (can_try_optimize) {
     if (eltw_and_eltw) {
-        dnnl::algorithm alg;
+        dnnl::algorithm cur_alg, prev_alg;
         float cur_scale, prev_scale, cur_alpha, prev_alpha, cur_beta, prev_beta;

-        p_ops.get_params_eltwise(prev_idx, prev_scale, alg, prev_alpha, prev_beta);
-        p_ops.get_params_eltwise(cur_idx, cur_scale, alg, cur_alpha, cur_beta);
+        p_ops.get_params_eltwise(prev_idx, prev_scale, prev_alg, prev_alpha, prev_beta);
+        p_ops.get_params_eltwise(cur_idx, cur_scale, cur_alg, cur_alpha, cur_beta);

-        // Eltwise + eltwise pair can be optimized only if cur_alpha is equal to 1.0f
-        if (cur_alpha == 1.0f && prev_scale == cur_scale) {
+        auto eltw_linear_and_eltw_linear = type_is_eltwise_linear(cur_type) && type_is_eltwise_linear(prev_type);
+        auto eltw_linear_and_eltw_non_linear = type_is_eltwise_linear(cur_type) && !type_is_eltwise_linear(prev_type) && cur_beta == 0;
+
+        // eltwise_linear + eltwise_linear combination can be optimized always
+        if (eltw_linear_and_eltw_linear) {
             dnnl::post_ops eltw_p_op;
-            eltw_p_op.append_eltwise(cur_scale, alg, prev_alpha, cur_beta + prev_beta);
+            float optimized_alpha = cur_alpha * prev_alpha * prev_scale;
+            float optimized_beta = cur_alpha * prev_beta * prev_scale + cur_beta;
+            float optimized_scale = cur_scale;
+            eltw_p_op.append_eltwise(optimized_scale, cur_alg, optimized_alpha, optimized_beta);

             // Combine 2 eltwises into one
             add_post_op(cur_type, eltw_p_op, optimized_p_ops, 0);
+        } else if (eltw_linear_and_eltw_non_linear) {
+            dnnl::post_ops eltw_p_op;
+            eltw_p_op.append_eltwise(cur_scale * prev_scale * cur_alpha, prev_alg, prev_alpha, prev_beta);
+
+            // Combine 2 eltwises into one
+            add_post_op(prev_type, eltw_p_op, optimized_p_ops, 0);
+        }
+
+        if (eltw_linear_and_eltw_linear || eltw_linear_and_eltw_non_linear) {
             // Marked current and previous eltwise operations as 'optimized' (they will be ignored on the next iteration of cycle)
             cur_post_ops[cur_post_op_idx].op_type = onednn_post_op_type::optimized;
             cur_post_ops[prev_post_op_idx].op_type = onednn_post_op_type::optimized_eltwise;

             // Set the flag if extra optimizations checking is needed
             if (cur_post_op_idx < post_ops_size - 1) {
-                if (cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::eltwise_linear ||
-                    cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::binary_add ||
-                    cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::binary_mul ||
-                    cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::optimized_eltwise) {
+                if (type_is_eltwise_linear(cur_post_ops[cur_post_op_idx + 1].op_type) ||
+                    type_is_binary_add_or_mul(cur_post_ops[cur_post_op_idx + 1].op_type) ||
+                    type_is_optimized_eltwise(cur_post_ops[cur_post_op_idx + 1].op_type)) {
                     optimization_is_completed = true;
                 }
             }

             cur_ops_pair_is_optimized = true;
         }
     } else if (bin_and_eltw) {
@@ -285,11 +314,13 @@ protected:
 auto bin_ops_can_be_optimized = cur_node.is_type<data>() && cur_node.is_constant() &&
                                 cur_node.get_users().size() == 1 && desc.data_type() == dnnl_f32;

-auto bin_add_and_eltw = alpha == 1.0f && scale == 1.0f && cur_type == onednn_post_op_type::binary_add && bin_ops_can_be_optimized;
-auto bin_mul_and_eltw = beta == 0.f && scale == 1.0f && cur_type == onednn_post_op_type::binary_mul && bin_ops_can_be_optimized;
+auto bin_add_and_eltw = alpha == 1.0f && scale == 1.0f && type_is_binary_add(cur_type) && bin_ops_can_be_optimized;
+auto bin_mul_and_eltw = beta == 0.f && type_is_binary_mul(cur_type) && bin_ops_can_be_optimized;

 if (bin_add_and_eltw || bin_mul_and_eltw) {
     memory::ptr cur_bin_mem_ptr = cur_node.as<data>().get_attached_memory_ptr();
+    if (cur_bin_mem_ptr == nullptr)
+        throw std::runtime_error("OneDNN post-ops optimization error: nonexistent node for bin + eltw");
     auto& stream = cur_bin_mem_ptr->get_engine()->get_program_stream();
     mem_lock<float, mem_lock_type::write> bin_and_eltw_lock(cur_bin_mem_ptr, stream);
@@ -302,7 +333,7 @@ protected:
     }
 } else {
     for (size_t data_idx = 0; data_idx < cur_bin_mem_size; data_idx++) {
-        bin_and_eltw_lock[data_idx] *= alpha;
+        bin_and_eltw_lock[data_idx] *= alpha * scale;
     }
 }
@@ -325,11 +356,13 @@ protected:
 auto bin_ops_can_be_optimized = prev_node.is_type<data>() && prev_node.is_constant() &&
                                 prev_node.get_users().size() == 1 && desc.data_type() == dnnl_f32;

-auto eltw_and_bin_add = alpha == 1.0f && scale == 1.0f && prev_type == onednn_post_op_type::binary_add && bin_ops_can_be_optimized;
-auto eltw_and_bin_mul = beta == 0.f && scale == 1.0f && prev_type == onednn_post_op_type::binary_mul && bin_ops_can_be_optimized;
+auto eltw_and_bin_add = alpha == 1.0f && scale == 1.0f && type_is_binary_add(prev_type) && bin_ops_can_be_optimized;
+auto eltw_and_bin_mul = beta == 0.f && type_is_binary_mul(prev_type) && bin_ops_can_be_optimized;

 if (eltw_and_bin_add || eltw_and_bin_mul) {
     memory::ptr prev_bin_mem_ptr = prev_node.as<data>().get_attached_memory_ptr();
+    if (prev_bin_mem_ptr == nullptr)
+        throw std::runtime_error("OneDNN post-ops optimization error: nonexistent node for eltw + bin");
     auto& stream = prev_bin_mem_ptr->get_engine()->get_program_stream();
     mem_lock<float, mem_lock_type::write> eltw_and_bin_lock(prev_bin_mem_ptr, stream);
@@ -342,7 +375,7 @@ protected:
     }
 } else {
     for (size_t data_idx = 0; data_idx < prev_bin_mem_size; data_idx++) {
-        eltw_and_bin_lock[data_idx] *= alpha;
+        eltw_and_bin_lock[data_idx] *= alpha * scale;
     }
 }
@@ -351,63 +384,69 @@ protected:
         cur_ops_pair_is_optimized = true;
     }
-} else if (eltw_and_sum) {
+} else if (sum_and_eltw) {
     dnnl::algorithm alg;
-    float cur_scale, prev_scale, alpha, beta;
+    float sum_scale, eltw_scale, alpha, beta;
     dnnl::memory::data_type data_type;

-    cldnn::program_node& prev_node = arg.get_dependency(cur_post_ops[prev_post_op_idx].mem_dep);
+    dnnl::algorithm next_alg;
+    float next_scale, next_alpha, next_beta;
+    size_t next_idx = cur_idx + 1;
+    size_t next_post_op_idx = cur_post_op_idx + 1;

-    p_ops.get_params_eltwise(cur_idx, cur_scale, alg, alpha, beta);
-    p_ops.get_params_sum(prev_idx, prev_scale, data_type);
+    bool can_optimize_eltw_and_sum = false;

-    // Eltwise operations can use runtime non-constant data buffers, so check that memory buffers consist of constant data only
-    auto eltw_ops_can_be_optimized = prev_node.is_type<data>() && prev_node.is_constant() &&
-                                     prev_node.get_users().size() == 1;
+    if (cur_post_op_idx < post_ops_size - 1) {
+        auto next_type = cur_post_ops[next_post_op_idx].op_type;
+        if (type_is_eltwise_linear(next_type)) {
+            p_ops.get_params_eltwise(next_idx, next_scale, next_alg, next_alpha, next_beta);

-    // Eltwise can be inserted into the scale field of previous sum if cur_beta is equal to 0.f
-    if (beta == 0.f && cur_scale == 1.0f && eltw_ops_can_be_optimized) {
-        dnnl::post_ops sum_p_op;
-        sum_p_op.append_sum(alpha * prev_scale, data_type);
-
-        // Insert cur eltwise into sum
-        add_post_op(prev_type, sum_p_op, optimized_p_ops, 0);
-
-        memory::ptr prev_eltw_mem_ptr = prev_node.as<data>().get_attached_memory_ptr();
-        auto& stream = prev_eltw_mem_ptr->get_engine()->get_program_stream();
-        mem_lock<float, mem_lock_type::write> eltw_and_sum_lock(prev_eltw_mem_ptr, stream);
-        size_t prev_eltw_mem_size = prev_node.get_output_layout().count();
-
-        // Also multiply sum on alpha for getting valid results
-        for (size_t data_idx = 0; data_idx < prev_eltw_mem_size; data_idx++) {
-            eltw_and_sum_lock[data_idx] *= alpha;
-        }
+            if (next_beta == 0)
+                can_optimize_eltw_and_sum = true;
+        }
+    }

-        // Marked current and previous operations as 'optimized' (they will be ignored on the next iteration of cycle)
-        cur_post_ops[cur_post_op_idx].op_type = onednn_post_op_type::optimized;
-        cur_post_ops[prev_post_op_idx].op_type = onednn_post_op_type::optimized_sum;
+    // Try to optimize eltwise (any) + sum + eltwise_linear (with beta = 0) chain of operations
+    if (can_optimize_eltw_and_sum) {
+        p_ops.get_params_sum(cur_idx, sum_scale, data_type);
+        p_ops.get_params_eltwise(prev_idx, eltw_scale, alg, alpha, beta);
+
+        dnnl::post_ops eltw_p_op_prev, sum_p_op;
+
+        eltw_p_op_prev.append_eltwise(eltw_scale * next_alpha * next_scale, alg, alpha, beta);
+        sum_p_op.append_sum(sum_scale * next_alpha, data_type);
+
+        add_post_op(prev_type, eltw_p_op_prev, optimized_p_ops, 0);
+        add_post_op(cur_type, sum_p_op, optimized_p_ops, 0);
+
+        // Marked current, previous and next operations as 'optimized' (they will be ignored on the next iteration of cycle)
+        cur_post_ops[prev_post_op_idx].op_type = onednn_post_op_type::optimized_eltwise;
+        cur_post_ops[cur_post_op_idx].op_type = onednn_post_op_type::optimized_sum;
+        cur_post_ops[next_post_op_idx].op_type = onednn_post_op_type::optimized;

         // Set the flag if extra optimizations checking is needed
-        if (cur_post_op_idx < post_ops_size - 1) {
-            if (cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::eltwise_linear ||
-                cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::optimized_eltwise) {
+        if (next_post_op_idx < post_ops_size - 1) {
+            if (type_is_eltwise_linear(cur_post_ops[next_post_op_idx + 1].op_type) ||
+                type_is_optimized_eltwise(cur_post_ops[next_post_op_idx + 1].op_type)) {
                 optimization_is_completed = true;
             }
         }

         cur_ops_pair_is_optimized = true;
     }
 } else if (eltw_and_scale) {
     dnnl::algorithm alg;
-    float cur_scale, alpha, beta;
+    float eltw_scale, alpha, beta;

     cldnn::program_node& prev_node = arg.get_dependency(cur_post_ops[prev_post_op_idx].mem_dep);

-    p_ops.get_params_eltwise(cur_idx, cur_scale, alg, alpha, beta);
+    p_ops.get_params_eltwise(cur_idx, eltw_scale, alg, alpha, beta);

-    // Eltwise can be inserted into output_scale if cur_beta is equal to 0.f and cur_scale is equal to 1.0f
-    if (beta == 0.f && cur_scale == 1.0f && prev_node.get_output_layout().data_type == data_types::f32) {
+    // Eltwise can be inserted into the output_scale if cur_beta is equal to 0.f
+    if (beta == 0.f && prev_node.get_output_layout().data_type == data_types::f32) {
         memory::ptr prev_scale_mem_ptr = prev_node.as<data>().get_attached_memory_ptr();
+        if (prev_scale_mem_ptr == nullptr)
+            throw std::runtime_error("OneDNN post-ops optimization error: nonexistent node for eltw + scale");
         auto& stream = prev_scale_mem_ptr->get_engine()->get_program_stream();
         mem_lock<float, mem_lock_type::write> eltw_and_scale_lock(prev_scale_mem_ptr, stream);
@@ -415,7 +454,7 @@ protected:
         // Update all scale coefficients
         for (size_t data_idx = 0; data_idx < prev_scale_mem_size; data_idx++) {
-            eltw_and_scale_lock[data_idx] *= alpha;
+            eltw_and_scale_lock[data_idx] *= alpha * eltw_scale;
         }

         // Marked current eltwise operation as 'optimized' (it will be ignored on the next iteration of cycle)
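
The eltwise folding above is just composition of affine maps: a oneDNN eltwise_linear post-op computes dst = scale * (alpha * src + beta), so applying cur after prev expands to cur_scale * (cur_alpha * prev_scale * (prev_alpha * x + prev_beta) + cur_beta), which yields the optimized_alpha/optimized_beta expressions in the hunk; the eltw_linear_and_eltw_non_linear branch is the special case where cur is linear with cur_beta = 0, so only the combined scale cur_scale * prev_scale * cur_alpha survives. A minimal standalone sketch of that arithmetic (editorial illustration, not part of the commit; eltw_linear_params and fold_linear are hypothetical names):

#include <cassert>

// dst = scale * (alpha * src + beta): the shape of a oneDNN eltwise_linear post-op
struct eltw_linear_params { float scale, alpha, beta; };

// Fold cur(prev(x)) into a single eltwise_linear, mirroring optimized_alpha / optimized_beta above:
//   cur_scale * (cur_alpha * (prev_scale * (prev_alpha * x + prev_beta)) + cur_beta)
// = cur_scale * ((cur_alpha * prev_alpha * prev_scale) * x + (cur_alpha * prev_beta * prev_scale + cur_beta))
eltw_linear_params fold_linear(eltw_linear_params prev, eltw_linear_params cur) {
    return { cur.scale,
             cur.alpha * prev.alpha * prev.scale,
             cur.alpha * prev.beta * prev.scale + cur.beta };
}

int main() {
    // Numbers from the eltw_linear + eltw_linear test below:
    // eltwise_linear:12.75:127.5 + eltwise_linear:1:-128 -> eltwise_linear:12.75:-0.5
    eltw_linear_params folded = fold_linear({1.f, 12.75f, 127.5f}, {1.f, 1.f, -128.f});
    assert(folded.alpha == 12.75f && folded.beta == -0.5f);
    return 0;
}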

@@ -9433,6 +9433,364 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_shift_swish_onednn,
bc_test_params{CASE_CONV_S8S8_13, 2, 7},
bc_test_params{CASE_CONV_S8S8_15, 2, 7},
}));
/* ----------------------------------------------------------------------------------------------------- */
/* ------------------------------ OneDNN post-ops cases with optimizations ----------------------------- */
/* ----------------------------------------------------------------------------------------------------- */
// Before optimization: eltw_linear + eltw_linear
// After optimization: eltw_linear
// Limitations: none
// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_linear:1:-128
// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_linear:12.75:-0.5
class post_ops_optimizations_onednn_eltw_linear_eltw_linear : public ConvFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_eltw_linear_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_single_element_layout(p), -10)),
data("in_hi", get_mem(get_single_element_layout(p), 10)),
data("out_lo", get_mem(get_single_element_layout(p), -128)),
data("out_hi", get_mem(get_single_element_layout(p), 127)),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_linear_eltw_linear,
::testing::ValuesIn(std::vector<bc_test_params>{
// cases with batch = 1
bc_test_params{CASE_CONV_U8S8_1, 2, 3},
bc_test_params{CASE_CONV_U8S8_2, 2, 3},
bc_test_params{CASE_CONV_U8S8_3, 2, 3},
bc_test_params{CASE_CONV_S8S8_1, 2, 3},
bc_test_params{CASE_CONV_S8S8_2, 2, 3},
bc_test_params{CASE_CONV_S8S8_3, 2, 3},
// cases with batch = 16
bc_test_params{CASE_CONV_U8S8_9, 2, 3},
bc_test_params{CASE_CONV_U8S8_10, 2, 3},
bc_test_params{CASE_CONV_S8S8_9, 2, 3},
bc_test_params{CASE_CONV_S8S8_10, 2, 3},
// cases with batch = 32
bc_test_params{CASE_CONV_U8S8_11, 2, 3},
bc_test_params{CASE_CONV_U8S8_12, 2, 3},
bc_test_params{CASE_CONV_U8S8_13, 2, 3},
bc_test_params{CASE_CONV_U8S8_14, 2, 3},
bc_test_params{CASE_CONV_S8S8_12, 2, 3},
bc_test_params{CASE_CONV_S8S8_13, 2, 3},
bc_test_params{CASE_CONV_S8S8_14, 2, 3},
bc_test_params{CASE_CONV_S8S8_15, 2, 3},
}));
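
// Editorial worked check (not part of the commit) of where the log constants above come from:
// quantizing in_lo = -10, in_hi = 10 into 256 levels gives alpha = 255 / (10 - (-10)) = 12.75 and
// beta = 127.5, i.e. eltwise_linear:12.75:127.5; the i8 shift is eltwise_linear:1:-128, and folding
// via optimized_beta = cur_alpha * prev_beta * prev_scale + cur_beta gives 127.5 - 128 = -0.5.
static_assert(255.f / 20.f == 12.75f, "first eltwise_linear alpha");
static_assert(1.f * 127.5f * 1.f + (-128.f) == -0.5f, "folded beta matches eltwise_linear:12.75:-0.5");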
// Before optimization: eltw_non_linear + eltw_linear
// After optimization: eltw_non_linear
// Limitations: beta = 0 in eltw_linear
// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round+eltwise_linear:2.00784+eltwise_clip:0:512
// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round:0:0:2.00784+eltwise_clip:0:512
class post_ops_optimizations_onednn_eltw_non_linear_eltw_linear : public ConvFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_eltw_non_linear_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_single_element_layout(p), -10)),
data("in_hi", get_mem(get_single_element_layout(p), 10)),
data("out_lo", get_mem(get_single_element_layout(p), 0)),
data("out_hi", get_mem(get_single_element_layout(p), 512)),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::f32),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_non_linear_eltw_linear,
::testing::ValuesIn(std::vector<bc_test_params>{
// cases with batch = 1
bc_test_params{CASE_CONV_U8S8_1, 2, 3},
bc_test_params{CASE_CONV_U8S8_2, 2, 3},
bc_test_params{CASE_CONV_U8S8_3, 2, 3},
bc_test_params{CASE_CONV_S8S8_1, 2, 3},
bc_test_params{CASE_CONV_S8S8_2, 2, 3},
bc_test_params{CASE_CONV_S8S8_3, 2, 3},
// cases with batch = 16
bc_test_params{CASE_CONV_U8S8_9, 2, 3},
bc_test_params{CASE_CONV_U8S8_10, 2, 3},
bc_test_params{CASE_CONV_S8S8_9, 2, 3},
bc_test_params{CASE_CONV_S8S8_10, 2, 3},
// cases with batch = 32
bc_test_params{CASE_CONV_U8S8_11, 2, 3},
bc_test_params{CASE_CONV_U8S8_12, 2, 3},
bc_test_params{CASE_CONV_U8S8_13, 2, 3},
bc_test_params{CASE_CONV_U8S8_14, 2, 3},
bc_test_params{CASE_CONV_S8S8_12, 2, 3},
bc_test_params{CASE_CONV_S8S8_13, 2, 3},
bc_test_params{CASE_CONV_S8S8_14, 2, 3},
bc_test_params{CASE_CONV_S8S8_15, 2, 3},
}));
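
// Editorial check (not part of the commit): the trailing eltwise_linear:2.00784 has beta = 0, so it
// folds into the non-linear op's output scale as cur_scale * prev_scale * cur_alpha, which is why
// eltwise_round reappears as eltwise_round:0:0:2.00784 in the optimized log; 2.00784 is the
// requantization factor 512 / 255 from the quantize output range.
static_assert(512.f / 255.f > 2.0078f && 512.f / 255.f < 2.0079f, "requantization factor ~2.00784");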
// Before optimization: binary_add + eltw_linear
// After optimization: binary_add
// Limitations: alpha = 1 and scale = 1 in eltw_linear; binary_add is a constant compile-time buffer
// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:binary_add:f32:2+eltwise_linear:1:-127+eltwise_clip:-127:127
// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:binary_add:f32:2+eltwise_clip:-127:127
class post_ops_optimizations_onednn_binary_add_eltw_linear : public ConvFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_binary_add_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), -127)),
data("out_hi", get_mem(get_single_element_layout(p), 127)),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_add_eltw_linear,
::testing::ValuesIn(std::vector<bc_test_params>{
// cases with batch = 1
bc_test_params{CASE_CONV_U8S8_1, 2, 3},
bc_test_params{CASE_CONV_U8S8_2, 2, 3},
bc_test_params{CASE_CONV_U8S8_3, 2, 3},
bc_test_params{CASE_CONV_S8S8_1, 2, 3},
bc_test_params{CASE_CONV_S8S8_2, 2, 3},
bc_test_params{CASE_CONV_S8S8_3, 2, 3},
// cases with batch = 16
bc_test_params{CASE_CONV_U8S8_9, 2, 3},
bc_test_params{CASE_CONV_U8S8_10, 2, 3},
bc_test_params{CASE_CONV_S8S8_9, 2, 3},
bc_test_params{CASE_CONV_S8S8_10, 2, 3},
// cases with batch = 32
bc_test_params{CASE_CONV_U8S8_11, 2, 3},
bc_test_params{CASE_CONV_U8S8_12, 2, 3},
bc_test_params{CASE_CONV_U8S8_13, 2, 3},
bc_test_params{CASE_CONV_U8S8_14, 2, 3},
bc_test_params{CASE_CONV_S8S8_12, 2, 3},
bc_test_params{CASE_CONV_S8S8_13, 2, 3},
bc_test_params{CASE_CONV_S8S8_14, 2, 3},
bc_test_params{CASE_CONV_S8S8_15, 2, 3},
}));
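
// Editorial sketch of why eltwise_linear:1:-127 can vanish (the corresponding '+= beta' branch lies
// between the hunks shown above, so this is an assumption from the surrounding code): with alpha = 1
// and scale = 1 the op is a pure shift by beta, and a shift applied after binary_add(C) equals
// binary_add(C + beta), so the constant buffer is patched in place.
static_assert((3.f + 2.f) + (-127.f) == 3.f + (2.f + (-127.f)), "shift folds into the add constant");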
// Before optimization: binary_mul + eltw_linear
// After optimization: binary_mul
// Limitations: beta = 0 in eltw_linear; binary_mul is a constant compile-time buffer
// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:binary_mul:f32:2+eltwise_linear:2.01575+eltwise_clip:0:512
// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:binary_mul:f32:2+eltwise_clip:0:512
class post_ops_optimizations_onednn_binary_mul_eltw_linear : public ConvFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_binary_mul_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("eltwise_data", get_mem(get_per_channel_layout(p), -1, 1)),
data("in_lo", get_mem(get_per_channel_layout(p), 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), 0)),
data("out_hi", get_mem(get_single_element_layout(p), 512)),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
eltwise("eltwise", { "conv_prim", "eltwise_data" }, eltwise_mode::prod),
quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_mul_eltw_linear,
::testing::ValuesIn(std::vector<bc_test_params>{
// cases with batch = 1
bc_test_params{CASE_CONV_U8S8_1, 2, 4},
bc_test_params{CASE_CONV_U8S8_2, 2, 4},
bc_test_params{CASE_CONV_U8S8_3, 2, 4},
bc_test_params{CASE_CONV_S8S8_1, 2, 4},
bc_test_params{CASE_CONV_S8S8_2, 2, 4},
bc_test_params{CASE_CONV_S8S8_3, 2, 4},
// cases with batch = 16
bc_test_params{CASE_CONV_U8S8_9, 2, 4},
bc_test_params{CASE_CONV_U8S8_10, 2, 4},
bc_test_params{CASE_CONV_S8S8_9, 2, 4},
bc_test_params{CASE_CONV_S8S8_10, 2, 4},
// cases with batch = 32
bc_test_params{CASE_CONV_U8S8_11, 2, 4},
bc_test_params{CASE_CONV_U8S8_12, 2, 4},
bc_test_params{CASE_CONV_U8S8_13, 2, 4},
bc_test_params{CASE_CONV_S8S8_12, 2, 4},
bc_test_params{CASE_CONV_S8S8_13, 2, 4},
bc_test_params{CASE_CONV_S8S8_14, 2, 4},
}));
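
// Editorial sketch (not part of the commit): with beta = 0 the eltwise_linear is a pure scaling, and
// scaling after binary_mul(C) equals binary_mul(C * alpha * scale); this is the visible
// "bin_and_eltw_lock[data_idx] *= alpha * scale" patch of the constant buffer in the hunk above.
// 2.01575 is the requantization factor 512 / 254 (255 quantize levels span 254 steps).
static_assert(512.f / 254.f > 2.0157f && 512.f / 254.f < 2.0158f, "requantization factor ~2.01575");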
// Before optimization: o_scale + eltw_linear
// After optimization: o_scale
// Limitations: beta = 0 in eltw_linear
// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:eltwise_linear:2.01575+eltwise_clip:0:512
// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:eltwise_clip:0:512
class post_ops_optimizations_onednn_oscale_eltw_linear : public ConvFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_oscale_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), 0)),
data("out_hi", get_mem(get_single_element_layout(p), 512)),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_oscale_eltw_linear,
::testing::ValuesIn(std::vector<bc_test_params>{
// cases with batch = 1
bc_test_params{CASE_CONV_U8S8_1, 2, 3},
bc_test_params{CASE_CONV_U8S8_2, 2, 3},
bc_test_params{CASE_CONV_U8S8_3, 2, 3},
bc_test_params{CASE_CONV_S8S8_1, 2, 3},
bc_test_params{CASE_CONV_S8S8_2, 2, 3},
bc_test_params{CASE_CONV_S8S8_3, 2, 3},
// cases with batch = 16
bc_test_params{CASE_CONV_U8S8_9, 2, 3},
bc_test_params{CASE_CONV_U8S8_10, 2, 3},
bc_test_params{CASE_CONV_S8S8_9, 2, 3},
bc_test_params{CASE_CONV_S8S8_10, 2, 3},
// cases with batch = 32
bc_test_params{CASE_CONV_U8S8_11, 2, 3},
bc_test_params{CASE_CONV_U8S8_12, 2, 3},
bc_test_params{CASE_CONV_U8S8_13, 2, 3},
bc_test_params{CASE_CONV_S8S8_12, 2, 3},
bc_test_params{CASE_CONV_S8S8_13, 2, 3},
bc_test_params{CASE_CONV_S8S8_14, 2, 3},
}));
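
// Editorial check (not part of the commit): with beta = 0, alpha * (oscale[c] * x) equals
// (oscale[c] * alpha) * x, so each per-channel output-scale coefficient is multiplied in place
// ("eltw_and_scale_lock[data_idx] *= alpha * eltw_scale" in the hunk above) and the eltwise_linear
// post-op disappears from the chain.
static_assert(2.01575f * (2.f * 3.f) == (2.f * 2.01575f) * 3.f, "scale folds into oscale");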
// Before optimization: eltw_any + sum + eltw_linear
// After optimization: eltw_any + sum
// Limitations: beta = 0 in eltw_linear
// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_relu+sum:1:0:u8+eltwise_linear:12.7+eltwise_clip:0:127
// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_relu:0:0:12.7+sum:12.7:0:u8+eltwise_clip:0:127
class post_ops_optimizations_onednn_eltw_any_sum_eltw_linear : public ConvFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_eltw_any_sum_eltw_linear, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_single_element_layout(p), 0)),
data("in_hi", get_mem(get_single_element_layout(p), 10)),
data("out_lo", get_mem(get_single_element_layout(p), 0)),
data("out_hi", get_mem(get_single_element_layout(p), 127)),
data("eltwise_data", get_mem(get_output_layout(p))),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
activation("activation", "conv_prim", activation_func::relu_negative_slope),
eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum),
quantize("quantize", "sum", "in_lo", "in_hi", "out_lo", "out_hi", 128, data_types::u8),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_any_sum_eltw_linear,
::testing::ValuesIn(std::vector<bc_test_params>{
// cases with batch = 1
bc_test_params{CASE_CONV_U8S8_1, 2, 5},
bc_test_params{CASE_CONV_U8S8_2, 2, 5},
bc_test_params{CASE_CONV_U8S8_3, 2, 5},
bc_test_params{CASE_CONV_S8S8_1, 2, 5},
bc_test_params{CASE_CONV_S8S8_2, 2, 5},
bc_test_params{CASE_CONV_S8S8_3, 2, 5},
// cases with batch = 16
bc_test_params{CASE_CONV_U8S8_10, 2, 5},
bc_test_params{CASE_CONV_S8S8_10, 2, 5},
// cases with batch = 32
bc_test_params{CASE_CONV_U8S8_11, 2, 5},
bc_test_params{CASE_CONV_U8S8_12, 2, 5},
bc_test_params{CASE_CONV_U8S8_13, 2, 5},
bc_test_params{CASE_CONV_U8S8_14, 2, 5},
bc_test_params{CASE_CONV_S8S8_12, 2, 5},
bc_test_params{CASE_CONV_S8S8_13, 2, 5},
bc_test_params{CASE_CONV_S8S8_14, 2, 5},
bc_test_params{CASE_CONV_S8S8_15, 2, 5},
}));
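
// Editorial check (not part of the commit) of the optimized log above, using the hunk's formulas
// (eltw_scale * next_alpha * next_scale for the relu scale, sum_scale * next_alpha for the sum scale):
constexpr float eltw_scale = 1.f, sum_scale = 1.f, next_alpha = 12.7f, next_scale = 1.f;
static_assert(eltw_scale * next_alpha * next_scale == 12.7f, "matches eltwise_relu:0:0:12.7");
static_assert(sum_scale * next_alpha == 12.7f, "matches sum:12.7:0:u8");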
// The input range is used in two cases: when the output range is not per-tensor, or when out_lo > out_hi
// Here out_lo > out_hi, so no optimization is applied
// DNNL_VERBOSE log: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round+eltwise_linear:-1:127
class post_ops_optimizations_input_range : public ConvFusingTestOneDNN {};
TEST_P(post_ops_optimizations_input_range, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_single_element_layout(p), -10)),
data("in_hi", get_mem(get_single_element_layout(p), 10)),
data("out_lo", get_mem(get_single_element_layout(p), 127)),
data("out_hi", get_mem(get_single_element_layout(p), -128)),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_input_range,
::testing::ValuesIn(std::vector<bc_test_params>{
// cases with batch = 1
bc_test_params{CASE_CONV_U8S8_1, 2, 3},
bc_test_params{CASE_CONV_U8S8_2, 2, 3},
bc_test_params{CASE_CONV_U8S8_3, 2, 3},
bc_test_params{CASE_CONV_S8S8_1, 2, 3},
bc_test_params{CASE_CONV_S8S8_2, 2, 3},
bc_test_params{CASE_CONV_S8S8_3, 2, 3},
// cases with batch = 16
bc_test_params{CASE_CONV_U8S8_9, 2, 3},
bc_test_params{CASE_CONV_U8S8_10, 2, 3},
bc_test_params{CASE_CONV_S8S8_9, 2, 3},
bc_test_params{CASE_CONV_S8S8_10, 2, 3},
// cases with batch = 32
bc_test_params{CASE_CONV_U8S8_11, 2, 3},
bc_test_params{CASE_CONV_U8S8_12, 2, 3},
bc_test_params{CASE_CONV_U8S8_13, 2, 3},
bc_test_params{CASE_CONV_U8S8_14, 2, 3},
bc_test_params{CASE_CONV_S8S8_12, 2, 3},
bc_test_params{CASE_CONV_S8S8_13, 2, 3},
bc_test_params{CASE_CONV_S8S8_14, 2, 3},
bc_test_params{CASE_CONV_S8S8_15, 2, 3},
}));
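
// Editorial check (not part of the commit): with out_lo = 127 > out_hi = -128 the output mapping is
// descending, so a quantize level l in [0, 255] maps to 127 - l, i.e. the log's final
// eltwise_linear:-1:127 (alpha = (out_hi - out_lo) / 255 = -1, beta = out_lo = 127); no pair here
// matches an optimization pattern, so the post-op chain is left as-is.
static_assert((-128.f - 127.f) / 255.f == -1.f, "alpha of eltwise_linear:-1:127");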
#endif