[GPU] New OneDNN post-ops optimizations and fusing tests (#8056)
* [GPU] New OneDNN post-ops optimizations and fusing tests
* [GPU] Code refactoring by github comments

Failing test disabled in #8109
This commit is contained in:
parent 6f862822e1, commit 262c87c6e1
@@ -161,6 +161,33 @@ protected:
             }
         };
 
+        // Check that post-op type is any optimized
+        auto type_is_any_optimized = [](onednn_post_op_type type) -> bool {
+            return type == onednn_post_op_type::optimized || type == onednn_post_op_type::optimized_sum ||
+                   type == onednn_post_op_type::optimized_eltwise;
+        };
+
+        // Check that post-op type is eltwise
+        auto type_is_eltwise = [](onednn_post_op_type type) -> bool {
+            return type == onednn_post_op_type::eltwise_round || type == onednn_post_op_type::eltwise_linear ||
+                   type == onednn_post_op_type::eltwise_clip || type == onednn_post_op_type::eltwise_act;
+        };
+
+        // Check that post-op type is binary_add or binary_mul
+        auto type_is_binary_add_or_mul = [](onednn_post_op_type type) -> bool {
+            return type == onednn_post_op_type::binary_add || type == onednn_post_op_type::binary_mul;
+        };
+
+        // Simple post-op type checks
+        auto type_is_optimized = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::optimized; };
+        auto type_is_eltwise_linear = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::eltwise_linear; };
+        auto type_is_optimized_eltwise = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::optimized_eltwise; };
+        auto type_is_binary_add = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::binary_add; };
+        auto type_is_binary_mul = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::binary_mul; };
+        auto type_is_sum = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::sum; };
+        auto type_is_optimized_sum = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::optimized_sum; };
+        auto type_is_scale = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::scale; };
+
         auto& cur_post_ops = onednn_fusing_map[node_id];
 
         size_t cur_post_op_idx = 1;
@@ -169,11 +196,11 @@ protected:
 
         // Check and update post-op map if we already optimized something
         for (size_t post_op_idx = 0; post_op_idx < cur_post_ops.size(); post_op_idx++) {
-            if (cur_post_ops[post_op_idx].op_type == onednn_post_op_type::optimized_sum)
+            if (type_is_optimized_sum(cur_post_ops[post_op_idx].op_type))
                 cur_post_ops[post_op_idx].op_type = onednn_post_op_type::sum;
-            else if (cur_post_ops[post_op_idx].op_type == onednn_post_op_type::optimized_eltwise)
+            else if (type_is_optimized_eltwise(cur_post_ops[post_op_idx].op_type))
                 cur_post_ops[post_op_idx].op_type = onednn_post_op_type::eltwise_linear;
-            else if (cur_post_ops[post_op_idx].op_type == onednn_post_op_type::optimized)
+            else if (type_is_optimized(cur_post_ops[post_op_idx].op_type))
                 cur_post_ops.erase(cur_post_ops.begin() + post_op_idx);
         }
 
@@ -186,8 +213,7 @@ protected:
             auto prev_type = cur_post_ops[prev_post_op_idx].op_type;
 
             // Ignore optimized operations for "previous" operation in our operation pair
-            while ((prev_type == onednn_post_op_type::optimized || prev_type == onednn_post_op_type::optimized_sum ||
-                    prev_type == onednn_post_op_type::optimized_eltwise) && cur_post_op_idx < post_ops_size - 1) {
+            while (type_is_any_optimized(prev_type) && cur_post_op_idx < post_ops_size - 1) {
                 prev_post_op_idx++;
                 cur_post_op_idx++;
                 prev_type = cur_post_ops[prev_post_op_idx].op_type;
@@ -195,80 +221,83 @@ protected:
             }
 
             // Ignore optimized operations for "current" operation in our operation pair
-            while ((cur_type == onednn_post_op_type::optimized || cur_type == onednn_post_op_type::optimized_sum ||
-                    cur_type == onednn_post_op_type::optimized_eltwise) && cur_post_op_idx < post_ops_size - 1) {
+            while (type_is_any_optimized(cur_type) && cur_post_op_idx < post_ops_size - 1) {
                 cur_post_op_idx++;
                 cur_type = cur_post_ops[cur_post_op_idx].op_type;
             }
 
             auto cur_idx = static_cast<int>(has_out_scales(attr) ? (cur_post_op_idx >= 1 ? cur_post_op_idx - 1 : 0) : cur_post_op_idx);
             auto prev_idx = static_cast<int>(has_out_scales(attr) ? (prev_post_op_idx >= 1 ? prev_post_op_idx - 1 : 0) : prev_post_op_idx);
-            auto cur_type_is_optimized = cur_type == onednn_post_op_type::optimized ||
-                                         cur_type == onednn_post_op_type::optimized_sum ||
-                                         cur_type == onednn_post_op_type::optimized_eltwise;
-            auto prev_type_is_optimized = prev_type == onednn_post_op_type::optimized ||
-                                          prev_type == onednn_post_op_type::optimized_sum ||
-                                          prev_type == onednn_post_op_type::optimized_eltwise;
 
             // If this is the last pair and it's optimized, add the last post-op and exit the loop
-            if (cur_post_op_idx == post_ops_size - 1 && (cur_type_is_optimized || prev_type_is_optimized)) {
-                if (!prev_type_is_optimized) {
+            if (cur_post_op_idx == post_ops_size - 1 && (type_is_any_optimized(cur_type) || type_is_any_optimized(prev_type))) {
+                if (!type_is_any_optimized(prev_type)) {
                     add_post_op(prev_type, p_ops, optimized_p_ops, prev_idx);
                 }
-                if (!cur_type_is_optimized) {
+                if (!type_is_any_optimized(cur_type)) {
                     add_post_op(cur_type, p_ops, optimized_p_ops, cur_idx);
                 }
                 break;
             }
 
-            auto equal_ops = cur_type == prev_type;
-            auto cur_type_is_binary_add_or_mul = cur_type == onednn_post_op_type::binary_add || cur_type == onednn_post_op_type::binary_mul;
-            auto prev_type_is_binary_add_or_mul = prev_type == onednn_post_op_type::binary_add || prev_type == onednn_post_op_type::binary_mul;
-
             // Post-ops combinations which can be simplified
-            auto eltw_and_eltw = equal_ops && cur_type == onednn_post_op_type::eltwise_linear;
-            auto bin_and_eltw = cur_type_is_binary_add_or_mul && prev_type == onednn_post_op_type::eltwise_linear;
-            auto eltw_and_bin = cur_type == onednn_post_op_type::eltwise_linear && prev_type_is_binary_add_or_mul;
-            auto eltw_and_sum = cur_type == onednn_post_op_type::eltwise_linear && prev_type == onednn_post_op_type::sum;
-            auto eltw_and_scale = cur_type == onednn_post_op_type::eltwise_linear && prev_type == onednn_post_op_type::scale;
+            auto eltw_and_eltw = type_is_eltwise(cur_type) && type_is_eltwise(prev_type);
+            auto bin_and_eltw = type_is_binary_add_or_mul(cur_type) && type_is_eltwise_linear(prev_type);
+            auto eltw_and_bin = type_is_eltwise_linear(cur_type) && type_is_binary_add_or_mul(prev_type);
+            auto sum_and_eltw = type_is_sum(cur_type) && type_is_eltwise(prev_type);
+            auto eltw_and_scale = type_is_eltwise_linear(cur_type) && type_is_scale(prev_type);
 
             auto can_try_optimize = eltw_and_eltw ||
                                     bin_and_eltw ||
                                     eltw_and_bin ||
-                                    eltw_and_sum ||
+                                    sum_and_eltw ||
                                     eltw_and_scale;
 
             bool cur_ops_pair_is_optimized = false;
 
             if (can_try_optimize) {
                 if (eltw_and_eltw) {
-                    dnnl::algorithm alg;
+                    dnnl::algorithm cur_alg, prev_alg;
                     float cur_scale, prev_scale, cur_alpha, prev_alpha, cur_beta, prev_beta;
 
-                    p_ops.get_params_eltwise(prev_idx, prev_scale, alg, prev_alpha, prev_beta);
-                    p_ops.get_params_eltwise(cur_idx, cur_scale, alg, cur_alpha, cur_beta);
+                    p_ops.get_params_eltwise(prev_idx, prev_scale, prev_alg, prev_alpha, prev_beta);
+                    p_ops.get_params_eltwise(cur_idx, cur_scale, cur_alg, cur_alpha, cur_beta);
 
-                    // Eltwise + eltwise pair can be optimized only if cur_alpha is equal to 1.0f
-                    if (cur_alpha == 1.0f && prev_scale == cur_scale) {
+                    auto eltw_linear_and_eltw_linear = type_is_eltwise_linear(cur_type) && type_is_eltwise_linear(prev_type);
+                    auto eltw_linear_and_eltw_non_linear = type_is_eltwise_linear(cur_type) && !type_is_eltwise_linear(prev_type) && cur_beta == 0;
+
+                    // An eltwise_linear + eltwise_linear combination can always be optimized
+                    if (eltw_linear_and_eltw_linear) {
                         dnnl::post_ops eltw_p_op;
-                        eltw_p_op.append_eltwise(cur_scale, alg, prev_alpha, cur_beta + prev_beta);
+                        float optimized_alpha = cur_alpha * prev_alpha * prev_scale;
+                        float optimized_beta = cur_alpha * prev_beta * prev_scale + cur_beta;
+                        float optimized_scale = cur_scale;
+                        eltw_p_op.append_eltwise(optimized_scale, cur_alg, optimized_alpha, optimized_beta);
 
                         // Combine 2 eltwises into one
                         add_post_op(cur_type, eltw_p_op, optimized_p_ops, 0);
+                    } else if (eltw_linear_and_eltw_non_linear) {
+                        dnnl::post_ops eltw_p_op;
+                        eltw_p_op.append_eltwise(cur_scale * prev_scale * cur_alpha, prev_alg, prev_alpha, prev_beta);
 
+                        // Combine 2 eltwises into one
+                        add_post_op(prev_type, eltw_p_op, optimized_p_ops, 0);
+                    }
+
+                    if (eltw_linear_and_eltw_linear || eltw_linear_and_eltw_non_linear) {
                         // Mark current and previous eltwise operations as 'optimized' (they will be ignored on the next iteration of the cycle)
                         cur_post_ops[cur_post_op_idx].op_type = onednn_post_op_type::optimized;
                         cur_post_ops[prev_post_op_idx].op_type = onednn_post_op_type::optimized_eltwise;
 
                         // Set the flag if extra optimizations checking is needed
                         if (cur_post_op_idx < post_ops_size - 1) {
-                            if (cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::eltwise_linear ||
-                                cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::binary_add ||
-                                cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::binary_mul ||
-                                cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::optimized_eltwise) {
+                            if (type_is_eltwise_linear(cur_post_ops[cur_post_op_idx + 1].op_type) ||
+                                type_is_binary_add_or_mul(cur_post_ops[cur_post_op_idx + 1].op_type) ||
+                                type_is_optimized_eltwise(cur_post_ops[cur_post_op_idx + 1].op_type)) {
                                 optimization_is_completed = true;
                             }
                         }
 
                         cur_ops_pair_is_optimized = true;
                     }
                 } else if (bin_and_eltw) {
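The eltw_linear + eltw_linear fold above is composition of affine functions: a oneDNN eltwise_linear post-op with parameters (scale, alpha, beta) computes scale * (alpha * x + beta), so applying prev and then cur yields a single eltwise_linear whose coefficients are exactly the optimized_alpha / optimized_beta / optimized_scale expressions in the hunk. A minimal standalone sketch of that algebra (the struct and helper names are illustrative, not cldnn or oneDNN API):

#include <cassert>
#include <cmath>
#include <initializer_list>

// A oneDNN eltwise_linear post-op with parameters (scale, alpha, beta)
// computes: out = scale * (alpha * x + beta).
struct eltw_linear { float scale, alpha, beta; };

float apply(const eltw_linear& op, float x) { return op.scale * (op.alpha * x + op.beta); }

// Compose cur(prev(x)) into one eltwise_linear, mirroring the
// optimized_scale / optimized_alpha / optimized_beta expressions above.
eltw_linear fuse(const eltw_linear& prev, const eltw_linear& cur) {
    return { cur.scale,
             cur.alpha * prev.alpha * prev.scale,
             cur.alpha * prev.beta * prev.scale + cur.beta };
}

int main() {
    // Values from the eltw_linear + eltw_linear fusing test below:
    // 12.75*x + 127.5 followed by 1*x - 128 folds to 12.75*x - 0.5.
    eltw_linear prev{1.f, 12.75f, 127.5f};
    eltw_linear cur{1.f, 1.f, -128.f};
    eltw_linear fused = fuse(prev, cur);
    for (float x : {-10.f, 0.f, 3.f, 10.f})
        assert(std::fabs(apply(cur, apply(prev, x)) - apply(fused, x)) < 1e-3f);
    return 0;
}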
@@ -285,11 +314,13 @@ protected:
                     auto bin_ops_can_be_optimized = cur_node.is_type<data>() && cur_node.is_constant() &&
                                                     cur_node.get_users().size() == 1 && desc.data_type() == dnnl_f32;
 
-                    auto bin_add_and_eltw = alpha == 1.0f && scale == 1.0f && cur_type == onednn_post_op_type::binary_add && bin_ops_can_be_optimized;
-                    auto bin_mul_and_eltw = beta == 0.f && scale == 1.0f && cur_type == onednn_post_op_type::binary_mul && bin_ops_can_be_optimized;
+                    auto bin_add_and_eltw = alpha == 1.0f && scale == 1.0f && type_is_binary_add(cur_type) && bin_ops_can_be_optimized;
+                    auto bin_mul_and_eltw = beta == 0.f && type_is_binary_mul(cur_type) && bin_ops_can_be_optimized;
 
                     if (bin_add_and_eltw || bin_mul_and_eltw) {
                         memory::ptr cur_bin_mem_ptr = cur_node.as<data>().get_attached_memory_ptr();
+                        if (cur_bin_mem_ptr == nullptr)
+                            throw std::runtime_error("OneDNN post-ops optimization error: nonexistent node for bin + eltw");
                         auto& stream = cur_bin_mem_ptr->get_engine()->get_program_stream();
                         mem_lock<float, mem_lock_type::write> bin_and_eltw_lock(cur_bin_mem_ptr, stream);
@@ -302,7 +333,7 @@ protected:
                             }
                         } else {
                             for (size_t data_idx = 0; data_idx < cur_bin_mem_size; data_idx++) {
-                                bin_and_eltw_lock[data_idx] *= alpha;
+                                bin_and_eltw_lock[data_idx] *= alpha * scale;
                             }
                         }
 
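The folding in the bin_and_eltw branch is a constant-buffer rescale. With prev = eltwise_linear, which computes scale * (alpha * x + beta), followed by cur = binary_mul with a constant buffer c, the pair evaluates to c[i] * scale * (alpha * x[i] + beta); when beta == 0 this equals x[i] * (c[i] * alpha * scale), so multiplying the buffer in place by alpha * scale (the loop above) removes the eltwise entirely. For binary_add the conditions require alpha == 1 and scale == 1, so the pair is (x + beta) + c[i], and in the branch not shown in this hunk beta is presumably folded into the buffer by addition instead.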
@@ -325,11 +356,13 @@ protected:
                     auto bin_ops_can_be_optimized = prev_node.is_type<data>() && prev_node.is_constant() &&
                                                     prev_node.get_users().size() == 1 && desc.data_type() == dnnl_f32;
 
-                    auto eltw_and_bin_add = alpha == 1.0f && scale == 1.0f && prev_type == onednn_post_op_type::binary_add && bin_ops_can_be_optimized;
-                    auto eltw_and_bin_mul = beta == 0.f && scale == 1.0f && prev_type == onednn_post_op_type::binary_mul && bin_ops_can_be_optimized;
+                    auto eltw_and_bin_add = alpha == 1.0f && scale == 1.0f && type_is_binary_add(prev_type) && bin_ops_can_be_optimized;
+                    auto eltw_and_bin_mul = beta == 0.f && type_is_binary_mul(prev_type) && bin_ops_can_be_optimized;
 
                     if (eltw_and_bin_add || eltw_and_bin_mul) {
                         memory::ptr prev_bin_mem_ptr = prev_node.as<data>().get_attached_memory_ptr();
+                        if (prev_bin_mem_ptr == nullptr)
+                            throw std::runtime_error("OneDNN post-ops optimization error: nonexistent node for eltw + bin");
                         auto& stream = prev_bin_mem_ptr->get_engine()->get_program_stream();
                         mem_lock<float, mem_lock_type::write> eltw_and_bin_lock(prev_bin_mem_ptr, stream);
@@ -342,7 +375,7 @@ protected:
                             }
                         } else {
                             for (size_t data_idx = 0; data_idx < prev_bin_mem_size; data_idx++) {
-                                eltw_and_bin_lock[data_idx] *= alpha;
+                                eltw_and_bin_lock[data_idx] *= alpha * scale;
                             }
                         }
 
@@ -351,63 +384,69 @@ protected:
 
                         cur_ops_pair_is_optimized = true;
                     }
-                } else if (eltw_and_sum) {
+                } else if (sum_and_eltw) {
                     dnnl::algorithm alg;
-                    float cur_scale, prev_scale, alpha, beta;
+                    float sum_scale, eltw_scale, alpha, beta;
                     dnnl::memory::data_type data_type;
 
-                    cldnn::program_node& prev_node = arg.get_dependency(cur_post_ops[prev_post_op_idx].mem_dep);
+                    dnnl::algorithm next_alg;
+                    float next_scale, next_alpha, next_beta;
+                    size_t next_idx = cur_idx + 1;
+                    size_t next_post_op_idx = cur_post_op_idx + 1;
 
-                    p_ops.get_params_eltwise(cur_idx, cur_scale, alg, alpha, beta);
-                    p_ops.get_params_sum(prev_idx, prev_scale, data_type);
+                    bool can_optimize_eltw_and_sum = false;
 
-                    // Eltwise operations can use runtime non-constant data buffers, so check that memory buffers consist of constant data only
-                    auto eltw_ops_can_be_optimized = prev_node.is_type<data>() && prev_node.is_constant() &&
-                                                     prev_node.get_users().size() == 1;
+                    if (cur_post_op_idx < post_ops_size - 1) {
+                        auto next_type = cur_post_ops[next_post_op_idx].op_type;
+                        if (type_is_eltwise_linear(next_type)) {
+                            p_ops.get_params_eltwise(next_idx, next_scale, next_alg, next_alpha, next_beta);
 
-                    // Eltwise can be inserted into the scale field of previous sum if cur_beta is equal to 0.f
-                    if (beta == 0.f && cur_scale == 1.0f && eltw_ops_can_be_optimized) {
-                        dnnl::post_ops sum_p_op;
-                        sum_p_op.append_sum(alpha * prev_scale, data_type);
-
-                        // Insert cur eltwise into sum
-                        add_post_op(prev_type, sum_p_op, optimized_p_ops, 0);
-
-                        memory::ptr prev_eltw_mem_ptr = prev_node.as<data>().get_attached_memory_ptr();
-                        auto& stream = prev_eltw_mem_ptr->get_engine()->get_program_stream();
-                        mem_lock<float, mem_lock_type::write> eltw_and_sum_lock(prev_eltw_mem_ptr, stream);
-
-                        size_t prev_eltw_mem_size = prev_node.get_output_layout().count();
-
-                        // Also multiply sum on alpha for getting valid results
-                        for (size_t data_idx = 0; data_idx < prev_eltw_mem_size; data_idx++) {
-                            eltw_and_sum_lock[data_idx] *= alpha;
+                            if (next_beta == 0)
+                                can_optimize_eltw_and_sum = true;
                         }
+                    }
 
-                        // Marked current and previous operations as 'optimized' (they will be ignored on the next iteration of cycle)
-                        cur_post_ops[cur_post_op_idx].op_type = onednn_post_op_type::optimized;
-                        cur_post_ops[prev_post_op_idx].op_type = onednn_post_op_type::optimized_sum;
+                    // Try to optimize eltwise (any) + sum + eltwise_linear (with beta = 0) chain of operations
+                    if (can_optimize_eltw_and_sum) {
+                        p_ops.get_params_sum(cur_idx, sum_scale, data_type);
+                        p_ops.get_params_eltwise(prev_idx, eltw_scale, alg, alpha, beta);
+
+                        dnnl::post_ops eltw_p_op_prev, sum_p_op;
+
+                        eltw_p_op_prev.append_eltwise(eltw_scale * next_alpha * next_scale, alg, alpha, beta);
+                        sum_p_op.append_sum(sum_scale * next_alpha, data_type);
+
+                        add_post_op(prev_type, eltw_p_op_prev, optimized_p_ops, 0);
+                        add_post_op(cur_type, sum_p_op, optimized_p_ops, 0);
+
+                        // Mark current, previous and next operations as 'optimized' (they will be ignored on the next iteration of the cycle)
+                        cur_post_ops[prev_post_op_idx].op_type = onednn_post_op_type::optimized_eltwise;
+                        cur_post_ops[cur_post_op_idx].op_type = onednn_post_op_type::optimized_sum;
+                        cur_post_ops[next_post_op_idx].op_type = onednn_post_op_type::optimized;
 
                         // Set the flag if extra optimizations checking is needed
-                        if (cur_post_op_idx < post_ops_size - 1) {
-                            if (cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::eltwise_linear ||
-                                cur_post_ops[cur_post_op_idx + 1].op_type == onednn_post_op_type::optimized_eltwise) {
+                        if (next_post_op_idx < post_ops_size - 1) {
+                            if (type_is_eltwise_linear(cur_post_ops[next_post_op_idx + 1].op_type) ||
+                                type_is_optimized_eltwise(cur_post_ops[next_post_op_idx + 1].op_type)) {
                                 optimization_is_completed = true;
                             }
                         }
 
                         cur_ops_pair_is_optimized = true;
                     }
                 } else if (eltw_and_scale) {
                     dnnl::algorithm alg;
-                    float cur_scale, alpha, beta;
+                    float eltw_scale, alpha, beta;
 
                     cldnn::program_node& prev_node = arg.get_dependency(cur_post_ops[prev_post_op_idx].mem_dep);
 
-                    p_ops.get_params_eltwise(cur_idx, cur_scale, alg, alpha, beta);
+                    p_ops.get_params_eltwise(cur_idx, eltw_scale, alg, alpha, beta);
 
-                    // Eltwise can be inserted into output_scale if cur_beta is equal to 0.f and cur_scale is equal to 1.0f
-                    if (beta == 0.f && cur_scale == 1.0f && prev_node.get_output_layout().data_type == data_types::f32) {
+                    // Eltwise can be inserted into the output_scale if cur_beta is equal to 0.f
+                    if (beta == 0.f && prev_node.get_output_layout().data_type == data_types::f32) {
                         memory::ptr prev_scale_mem_ptr = prev_node.as<data>().get_attached_memory_ptr();
+                        if (prev_scale_mem_ptr == nullptr)
+                            throw std::runtime_error("OneDNN post-ops optimization error: nonexistent node for eltw + scale");
                         auto& stream = prev_scale_mem_ptr->get_engine()->get_program_stream();
                         mem_lock<float, mem_lock_type::write> eltw_and_scale_lock(prev_scale_mem_ptr, stream);
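The sum_and_eltw rewrite handles a three-op chain: eltwise (any) + sum + eltwise_linear with next_beta == 0. The trailing eltwise_linear computes next_scale * next_alpha * y, and distributing it over y = eltw(x) + sum_scale * dst rescales both terms, which is where the eltw_scale * next_alpha * next_scale and sum_scale * next_alpha factors in the code come from. With the numbers of the eltw_any + sum + eltw_linear test below (all base scales 1, trailing eltwise_linear alpha = 12.7): 12.7 * (relu(x) + dst) = 12.7 * relu(x) + 12.7 * dst, i.e. relu with output scale 12.7 plus sum with scale 12.7, matching the DNNL_VERBOSE lines quoted in that test.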
@@ -415,7 +454,7 @@ protected:
 
                         // Update all scale coefficients
                         for (size_t data_idx = 0; data_idx < prev_scale_mem_size; data_idx++) {
-                            eltw_and_scale_lock[data_idx] *= alpha;
+                            eltw_and_scale_lock[data_idx] *= alpha * eltw_scale;
                         }
 
                         // Mark current eltwise operation as 'optimized' (it will be ignored on the next iteration of the cycle)
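The eltw_and_scale fold works the same way on the per-channel output-scale buffer: with o[i] the output scale and a following eltwise_linear with beta == 0, the pair computes eltw_scale * alpha * (o[i] * x) = (o[i] * alpha * eltw_scale) * x, so the loop above absorbs the eltwise by multiplying every scale coefficient by alpha * eltw_scale. The f32 check is needed because the buffer is rewritten in place as float.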
@@ -9433,6 +9433,364 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_shift_swish_onednn,
                         bc_test_params{CASE_CONV_S8S8_13, 2, 7},
                         bc_test_params{CASE_CONV_S8S8_15, 2, 7},
                         }));
+
+/* ----------------------------------------------------------------------------------------------------- */
+/* ------------------------------ OneDNN post-ops cases with optimizations ----------------------------- */
+/* ----------------------------------------------------------------------------------------------------- */
+
+// Before optimization: eltw_linear + eltw_linear
+// After optimization: eltw_linear
+// Limitations: none
+// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_linear:1:-128
+// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_linear:12.75:-0.5
+class post_ops_optimizations_onednn_eltw_linear_eltw_linear : public ConvFusingTestOneDNN {};
+TEST_P(post_ops_optimizations_onednn_eltw_linear_eltw_linear, basic) {
+    auto p = GetParam();
+    create_topologies(input_layout("input", get_input_layout(p)),
+                      data("weights", get_mem(get_weights_layout(p))),
+                      data("bias", get_mem(get_bias_layout(p))),
+                      data("in_lo", get_mem(get_single_element_layout(p), -10)),
+                      data("in_hi", get_mem(get_single_element_layout(p), 10)),
+                      data("out_lo", get_mem(get_single_element_layout(p), -128)),
+                      data("out_hi", get_mem(get_single_element_layout(p), 127)),
+                      convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
+                      quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8),
+                      reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
+    );
+
+    tolerance = 1.f;
+    execute(p);
+}
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_linear_eltw_linear,
+                         ::testing::ValuesIn(std::vector<bc_test_params>{
+                             // cases with batch = 1
+                             bc_test_params{CASE_CONV_U8S8_1, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_2, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_3, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_1, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_2, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_3, 2, 3},
+
+                             // cases with batch = 16
+                             bc_test_params{CASE_CONV_U8S8_9, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_10, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_9, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_10, 2, 3},
+
+                             // cases with batch = 32
+                             bc_test_params{CASE_CONV_U8S8_11, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_12, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_13, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_14, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_12, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_13, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_14, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_15, 2, 3},
+                         }));
+// Before optimization: eltw_non_linear + eltw_linear
+// After optimization: eltw_non_linear
+// Limitations: beta = 0 in eltw_linear
+// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round+eltwise_linear:2.00784+eltwise_clip:0:512
+// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round:0:0:2.00784+eltwise_clip:0:512
+class post_ops_optimizations_onednn_eltw_non_linear_eltw_linear : public ConvFusingTestOneDNN {};
+TEST_P(post_ops_optimizations_onednn_eltw_non_linear_eltw_linear, basic) {
+    auto p = GetParam();
+    create_topologies(input_layout("input", get_input_layout(p)),
+                      data("weights", get_mem(get_weights_layout(p))),
+                      data("bias", get_mem(get_bias_layout(p))),
+                      data("in_lo", get_mem(get_single_element_layout(p), -10)),
+                      data("in_hi", get_mem(get_single_element_layout(p), 10)),
+                      data("out_lo", get_mem(get_single_element_layout(p), 0)),
+                      data("out_hi", get_mem(get_single_element_layout(p), 512)),
+                      convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
+                      quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::f32),
+                      reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
+    );
+
+    tolerance = 1.f;
+    execute(p);
+}
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_non_linear_eltw_linear,
+                         ::testing::ValuesIn(std::vector<bc_test_params>{
+                             // cases with batch = 1
+                             bc_test_params{CASE_CONV_U8S8_1, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_2, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_3, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_1, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_2, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_3, 2, 3},
+
+                             // cases with batch = 16
+                             bc_test_params{CASE_CONV_U8S8_9, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_10, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_9, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_10, 2, 3},
+
+                             // cases with batch = 32
+                             bc_test_params{CASE_CONV_U8S8_11, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_12, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_13, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_14, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_12, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_13, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_14, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_15, 2, 3},
+                         }));
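A worked check of the logs above, reading the verbose fields as alg:alpha:beta:scale: eltwise_round ignores alpha and beta, so round followed by eltwise_linear with alpha = 2.00784 and beta = 0 folds into a single round with output scale cur_scale * prev_scale * cur_alpha = 1 * 1 * 2.00784, printed as eltwise_round:0:0:2.00784. The leading eltwise_linear:12.75:127.5 and the trailing eltwise_clip:0:512 are left untouched.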
+// Before optimization: binary_add + eltw_linear
+// After optimization: binary_add
+// Limitations: alpha = 1 and scale = 1 in eltw_linear; binary_add is a constant compile-time buffer
+// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:binary_add:f32:2+eltwise_linear:1:-127+eltwise_clip:-127:127
+// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:binary_add:f32:2+eltwise_clip:-127:127
+class post_ops_optimizations_onednn_binary_add_eltw_linear : public ConvFusingTestOneDNN {};
+TEST_P(post_ops_optimizations_onednn_binary_add_eltw_linear, basic) {
+    auto p = GetParam();
+    create_topologies(input_layout("input", get_input_layout(p)),
+                      data("weights", get_mem(get_weights_layout(p))),
+                      data("bias", get_mem(get_bias_layout(p))),
+                      data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
+                      data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
+                      data("out_lo", get_mem(get_single_element_layout(p), -127)),
+                      data("out_hi", get_mem(get_single_element_layout(p), 127)),
+                      convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
+                      quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
+                      reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
+    );
+
+    tolerance = 1.f;
+    execute(p);
+}
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_add_eltw_linear,
+                         ::testing::ValuesIn(std::vector<bc_test_params>{
+                             // cases with batch = 1
+                             bc_test_params{CASE_CONV_U8S8_1, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_2, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_3, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_1, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_2, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_3, 2, 3},
+
+                             // cases with batch = 16
+                             bc_test_params{CASE_CONV_U8S8_9, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_10, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_9, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_10, 2, 3},
+
+                             // cases with batch = 32
+                             bc_test_params{CASE_CONV_U8S8_11, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_12, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_13, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_14, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_12, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_13, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_14, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_15, 2, 3},
+                         }));
+// Before optimization: binary_mul + eltw_linear
+// After optimization: binary_mul
+// Limitations: beta = 0 in eltw_linear; binary_mul is a constant compile-time buffer
+// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:binary_mul:f32:2+eltwise_linear:2.01575+eltwise_clip:0:512
+// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:binary_mul:f32:2+eltwise_clip:0:512
+class post_ops_optimizations_onednn_binary_mul_eltw_linear : public ConvFusingTestOneDNN {};
+TEST_P(post_ops_optimizations_onednn_binary_mul_eltw_linear, basic) {
+    auto p = GetParam();
+    create_topologies(input_layout("input", get_input_layout(p)),
+                      data("weights", get_mem(get_weights_layout(p))),
+                      data("bias", get_mem(get_bias_layout(p))),
+                      data("eltwise_data", get_mem(get_per_channel_layout(p), -1, 1)),
+                      data("in_lo", get_mem(get_per_channel_layout(p), 0)),
+                      data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
+                      data("out_lo", get_mem(get_single_element_layout(p), 0)),
+                      data("out_hi", get_mem(get_single_element_layout(p), 512)),
+                      convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
+                      eltwise("eltwise", { "conv_prim", "eltwise_data" }, eltwise_mode::prod),
+                      quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
+                      reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
+    );
+
+    tolerance = 1.f;
+    execute(p);
+}
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_mul_eltw_linear,
+                         ::testing::ValuesIn(std::vector<bc_test_params>{
+                             // cases with batch = 1
+                             bc_test_params{CASE_CONV_U8S8_1, 2, 4},
+                             bc_test_params{CASE_CONV_U8S8_2, 2, 4},
+                             bc_test_params{CASE_CONV_U8S8_3, 2, 4},
+                             bc_test_params{CASE_CONV_S8S8_1, 2, 4},
+                             bc_test_params{CASE_CONV_S8S8_2, 2, 4},
+                             bc_test_params{CASE_CONV_S8S8_3, 2, 4},
+
+                             // cases with batch = 16
+                             bc_test_params{CASE_CONV_U8S8_9, 2, 4},
+                             bc_test_params{CASE_CONV_U8S8_10, 2, 4},
+                             bc_test_params{CASE_CONV_S8S8_9, 2, 4},
+                             bc_test_params{CASE_CONV_S8S8_10, 2, 4},
+
+                             // cases with batch = 32
+                             bc_test_params{CASE_CONV_U8S8_11, 2, 4},
+                             bc_test_params{CASE_CONV_U8S8_12, 2, 4},
+                             bc_test_params{CASE_CONV_U8S8_13, 2, 4},
+                             bc_test_params{CASE_CONV_S8S8_12, 2, 4},
+                             bc_test_params{CASE_CONV_S8S8_13, 2, 4},
+                             bc_test_params{CASE_CONV_S8S8_14, 2, 4},
+                         }));
+// Before optimization: o_scale + eltw_linear
+// After optimization: o_scale
+// Limitations: beta = 0 in eltw_linear
+// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:eltwise_linear:2.01575+eltwise_clip:0:512
+// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:eltwise_clip:0:512
+class post_ops_optimizations_onednn_oscale_eltw_linear : public ConvFusingTestOneDNN {};
+TEST_P(post_ops_optimizations_onednn_oscale_eltw_linear, basic) {
+    auto p = GetParam();
+    create_topologies(input_layout("input", get_input_layout(p)),
+                      data("weights", get_mem(get_weights_layout(p))),
+                      data("bias", get_mem(get_bias_layout(p))),
+                      data("in_lo", get_mem(get_per_channel_layout(p), 0)),
+                      data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
+                      data("out_lo", get_mem(get_single_element_layout(p), 0)),
+                      data("out_hi", get_mem(get_single_element_layout(p), 512)),
+                      convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
+                      quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
+                      reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
+    );
+
+    tolerance = 1.f;
+    execute(p);
+}
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_oscale_eltw_linear,
+                         ::testing::ValuesIn(std::vector<bc_test_params>{
+                             // cases with batch = 1
+                             bc_test_params{CASE_CONV_U8S8_1, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_2, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_3, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_1, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_2, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_3, 2, 3},
+
+                             // cases with batch = 16
+                             bc_test_params{CASE_CONV_U8S8_9, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_10, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_9, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_10, 2, 3},
+
+                             // cases with batch = 32
+                             bc_test_params{CASE_CONV_U8S8_11, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_12, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_13, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_12, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_13, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_14, 2, 3},
+                         }));
+// Before optimization: eltw_any + sum + eltw_linear
+// After optimization: eltw_any + sum
+// Limitations: beta = 0 in eltw_linear
+// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_relu+sum:1:0:u8+eltwise_linear:12.7+eltwise_clip:0:127
+// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_relu:0:0:12.7+sum:12.7:0:u8+eltwise_clip:0:127
+class post_ops_optimizations_onednn_eltw_any_sum_eltw_linear : public ConvFusingTestOneDNN {};
+TEST_P(post_ops_optimizations_onednn_eltw_any_sum_eltw_linear, basic) {
+    auto p = GetParam();
+    create_topologies(input_layout("input", get_input_layout(p)),
+                      data("weights", get_mem(get_weights_layout(p))),
+                      data("bias", get_mem(get_bias_layout(p))),
+                      data("in_lo", get_mem(get_single_element_layout(p), 0)),
+                      data("in_hi", get_mem(get_single_element_layout(p), 10)),
+                      data("out_lo", get_mem(get_single_element_layout(p), 0)),
+                      data("out_hi", get_mem(get_single_element_layout(p), 127)),
+                      data("eltwise_data", get_mem(get_output_layout(p))),
+                      convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
+                      activation("activation", "conv_prim", activation_func::relu_negative_slope),
+                      eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum),
+                      quantize("quantize", "sum", "in_lo", "in_hi", "out_lo", "out_hi", 128, data_types::u8),
+                      reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
+    );
+
+    tolerance = 1.f;
+    execute(p);
+}
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_any_sum_eltw_linear,
+                         ::testing::ValuesIn(std::vector<bc_test_params>{
+                             // cases with batch = 1
+                             bc_test_params{CASE_CONV_U8S8_1, 2, 5},
+                             bc_test_params{CASE_CONV_U8S8_2, 2, 5},
+                             bc_test_params{CASE_CONV_U8S8_3, 2, 5},
+                             bc_test_params{CASE_CONV_S8S8_1, 2, 5},
+                             bc_test_params{CASE_CONV_S8S8_2, 2, 5},
+                             bc_test_params{CASE_CONV_S8S8_3, 2, 5},
+
+                             // cases with batch = 16
+                             bc_test_params{CASE_CONV_U8S8_10, 2, 5},
+                             bc_test_params{CASE_CONV_S8S8_10, 2, 5},
+
+                             // cases with batch = 32
+                             bc_test_params{CASE_CONV_U8S8_11, 2, 5},
+                             bc_test_params{CASE_CONV_U8S8_12, 2, 5},
+                             bc_test_params{CASE_CONV_U8S8_13, 2, 5},
+                             bc_test_params{CASE_CONV_U8S8_14, 2, 5},
+                             bc_test_params{CASE_CONV_S8S8_12, 2, 5},
+                             bc_test_params{CASE_CONV_S8S8_13, 2, 5},
+                             bc_test_params{CASE_CONV_S8S8_14, 2, 5},
+                             bc_test_params{CASE_CONV_S8S8_15, 2, 5},
+                         }));
+// The input range is used in two cases: when the output range is not per-tensor, or when out_lo > out_hi
+// Here out_lo > out_hi, so no optimization is applied
+// DNNL_VERBOSE log: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round+eltwise_linear:-1:127
+class post_ops_optimizations_input_range : public ConvFusingTestOneDNN {};
+TEST_P(post_ops_optimizations_input_range, basic) {
+    auto p = GetParam();
+    create_topologies(input_layout("input", get_input_layout(p)),
+                      data("weights", get_mem(get_weights_layout(p))),
+                      data("bias", get_mem(get_bias_layout(p))),
+                      data("in_lo", get_mem(get_single_element_layout(p), -10)),
+                      data("in_hi", get_mem(get_single_element_layout(p), 10)),
+                      data("out_lo", get_mem(get_single_element_layout(p), 127)),
+                      data("out_hi", get_mem(get_single_element_layout(p), -128)),
+                      convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
+                      quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8),
+                      reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
+    );
+
+    tolerance = 1.f;
+    execute(p);
+}
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_input_range,
+                         ::testing::ValuesIn(std::vector<bc_test_params>{
+                             // cases with batch = 1
+                             bc_test_params{CASE_CONV_U8S8_1, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_2, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_3, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_1, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_2, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_3, 2, 3},
+
+                             // cases with batch = 16
+                             bc_test_params{CASE_CONV_U8S8_9, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_10, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_9, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_10, 2, 3},
+
+                             // cases with batch = 32
+                             bc_test_params{CASE_CONV_U8S8_11, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_12, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_13, 2, 3},
+                             bc_test_params{CASE_CONV_U8S8_14, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_12, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_13, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_14, 2, 3},
+                             bc_test_params{CASE_CONV_S8S8_15, 2, 3},
+                         }));
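Arithmetic behind the log above: with input range [-10, 10] and 256 levels, the input side becomes x * 255/20 + 127.5 = 12.75 * x + 127.5, mapping [-10, 10] onto [0, 255]; after eltwise_round, the inverted output range (out_lo = 127 > out_hi = -128) is applied as eltwise_linear:-1:127, mapping 0 to 127 and 255 to -128. The trailing eltwise_linear has beta = 127 != 0 and follows a non-linear round, so the eltw_linear + eltw_non_linear fold (which requires beta = 0 in the linear op) does not apply and the chain is emitted unchanged.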
 #endif