[GPU] Code refactoring to choose between binary_add and sum (#10724)

+ Fix colorization-sig accuracy issue using oneDNN
	Fix a memory crash in the reuse_eltwise_sum_post case in oneDNN and memory_pool
	And print node in/out gpu_usm_mem addr at OV_GPU_Verbose >= 1
+ Check the size of the z spatial axis when checking for a full tensor.
+ Remove program_helpers's functions.

Co-authored-by: hyunback <hyunback.kim@intel.com>
This commit is contained in:
Jade Cho
2022-03-22 14:58:36 +09:00
committed by GitHub
parent e8288eb31d
commit a7df1531db
13 changed files with 356 additions and 249 deletions

View File

@@ -34,6 +34,8 @@ struct memory {
virtual void unlock(const stream& stream) = 0;
virtual event::ptr fill(stream& stream, unsigned char pattern) = 0;
virtual event::ptr fill(stream& stream) = 0;
// only supports gpu_usm
virtual void* buffer_ptr() const { return nullptr; }
size_t size() const { return _bytes_count; }
size_t count() const { return _layout.count(); }

View File

@@ -36,39 +36,26 @@ void basic_memory_dependencies::run(program& p) {
add_memory_dependency(it, node);
}
if (node->is_type<convolution>() && node->get_preferred_impl_type() == impl_types::onednn) {
auto& conv = node->as<convolution>();
bool can_reuse_eltwise_mem = false;
if (node->get_preferred_impl_type() == impl_types::onednn
&& (node->is_type<convolution>() || node->is_type<deconvolution>())) {
size_t eltw_dep = 0;
for (auto& fused_op : conv.get_fused_primitives()) {
for (auto& fused_op : node->get_fused_primitives()) {
if (fused_op.node->is_type<eltwise>() && fused_op.deps.size() == 1) {
auto eltw_in_layout = conv.get_dependency(fused_op.dep_start_idx).get_output_layout();
auto conv_out_layout = node->get_output_layout();
if (eltw_dep > 0) {
can_reuse_eltwise_mem = false;
break;
}
// If it is first sum, reuse the buffer
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
continue;
if (eltw_in_layout.size == conv_out_layout.size &&
eltw_in_layout.format == conv_out_layout.format &&
eltw_in_layout.data_padding == conv_out_layout.data_padding &&
data_type_traits::size_of(eltw_in_layout.data_type) == data_type_traits::size_of(conv_out_layout.data_type)) {
eltw_dep = fused_op.dep_start_idx;
can_reuse_eltwise_mem = true;
eltw_dep = fused_op.dep_start_idx;
auto& eltw_node = node->get_dependency(eltw_dep);
eltw_node.can_share_buffer(false);
node->can_share_buffer(false);
for (auto& user : node->get_users()) {
add_memory_dependency(user, &eltw_node);
add_memory_dependency(user, node);
}
}
}
if (can_reuse_eltwise_mem) {
auto& eltw_node = conv.get_dependency(eltw_dep);
eltw_node.can_share_buffer(false);
conv.can_share_buffer(false);
for (auto& user : conv.get_users()) {
add_memory_dependency(user, &eltw_node);
add_memory_dependency(user, &conv);
}
}
}
// Note we iterate over processing order, it means if primitive has processing num greater than any of outputs,

View File

@@ -93,16 +93,11 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
for (auto& input : node.get_dependencies()) {
if (input->get_preferred_impl_type() == impl_types::onednn) {
for (auto& fused_op : input->get_fused_primitives()) {
if (fused_op.node->is_type<eltwise>() && fused_op.deps.size() == 1) {
auto& eltw_in = input->get_dependency(fused_op.dep_start_idx);
auto eltw_in_layout = eltw_in.get_output_layout();
auto out_layout = input->get_output_layout();
if (!program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(), eltw_in_layout))
continue;
if (program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in_layout, out_layout))
return false;
}
auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(*input, fused_op);
if (add_type == add_fusing_type::sum)
return false;
else
continue;
}
is_onednn_impl = true;
}

View File

@@ -649,44 +649,38 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
// When the conv node is of onednn impl type and eltwise sum with full tensor is fused,
// changes the input format of eltwise sum post-op to use binary add.
if (conv_node.get_preferred_impl_type() == impl_types::onednn) {
std::vector<size_t> eltw_sum_dep_indices;
for (size_t i = 1; i < conv_node.get_dependencies().size(); i++) {
auto& dep = conv_node.get_dependency(i);
for (auto& fused_op : conv_node.get_fused_primitives()) {
if (fused_op.node->is_type<eltwise>()
&& fused_op.node->as<eltwise>().get_primitive()->mode == eltwise_mode::sum
&& !program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(),
conv_node.get_dependency(fused_op.dep_start_idx).get_output_layout())
&& conv_node.get_dependency(fused_op.dep_start_idx).get_users().size() == 1
&& conv_node.get_dependency(fused_op.dep_start_idx).id() == dep.id()) {
eltw_sum_dep_indices.push_back(i);
}
}
}
onednn_add_fusing_helpers::for_eltwise(conv_node, eltwise_mode::sum,
[&](const program_node& p_node, const eltwise_node& e_node, const fused_primitive_desc& desc) {
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(p_node, desc);
if (fusing_type == add_fusing_type::binary_per_tensor) {
auto& dep_node = p_node.get_dependency(desc.dep_start_idx);
auto d_layout = dep_node.get_output_layout();
auto d_format = d_layout.format;
auto expected_format = format::any;
auto conv_layout = conv_node.get_output_layout();
for (auto& dep_id : eltw_sum_dep_indices) {
auto& prev_node = conv_node.get_dependency(dep_id);
auto old_layout = prev_node.get_output_layout();
auto expected_format = format::any;
if ((conv_layout.data_type == data_types::f16 || conv_layout.data_type == data_types::f32)
&& data_type_traits::is_i8_u8(old_layout.data_type)) {
if (conv_layout.format == format::b_fs_yx_fsv16)
expected_format = format::b_fs_yx_fsv32;
if (conv_layout.format == format::bs_fs_yx_bsv32_fsv16)
expected_format = format::bs_fs_yx_bsv32_fsv32;
}
if (data_type_traits::is_i8_u8(d_layout.data_type)) {
if (d_format == format::b_fs_yx_fsv16)
expected_format = format::b_fs_yx_fsv32;
else if (d_format == format::bs_fs_yx_bsv32_fsv16)
expected_format = format::bs_fs_yx_bsv32_fsv32;
} else if (data_type_traits::is_floating_point(d_layout.data_type)) {
if (d_format == format::b_fs_yx_fsv32)
expected_format = format::b_fs_yx_fsv16;
else if (d_format == format::bs_fs_yx_bsv32_fsv32)
expected_format = format::bs_fs_yx_bsv32_fsv16;
}
if (expected_format != format::any && old_layout.format != expected_format) {
auto new_layout = old_layout;
new_layout.format = expected_format;
auto new_input = rf.get_reorder(prev_node.id(), old_layout, new_layout);
if (new_input.first) {
p.add_intermediate(new_input.first, conv_node, dep_id, !new_input.second);
if (expected_format != format::any && d_layout.format != expected_format) {
auto new_layout = d_layout;
new_layout.format = expected_format;
auto new_input = rf.get_reorder(dep_node.id(), d_layout, new_layout);
if (new_input.first) {
p.add_intermediate(new_input.first, conv_node, desc.dep_start_idx, !new_input.second);
}
conv_node.get_dependency(desc.dep_start_idx).set_output_layout(new_layout, false);
}
}
conv_node.get_dependency(dep_id).set_output_layout(new_layout, false);
}
}
});
}
};

View File

@@ -11,10 +11,12 @@
#include "intel_gpu/graph/program.hpp"
#include "data_inst.h"
#include "eltwise_inst.h"
#include "convolution_inst.h"
#include <string>
#include <vector>
#include <utility>
#include <iostream>
namespace cldnn {
struct program_helpers {
@@ -125,12 +127,35 @@ struct program_helpers {
}
}
static layout get_weights_layout(typed_program_node<cldnn::data>& data_node, int32_t split);
static bool are_layouts_identical_for_onednn_sum_post_op(layout input_layout, layout output_layout);
static bool needs_onednn_sum_post_op(const eltwise_node& n, layout input_layout);
};
struct onednn_add_fusing_helpers {
enum class add_fusing_type {
sum,
binary_per_tensor,
binary_per_oc,
not_supported,
};
static bool is_full_tensor(const layout& layout);
static std::vector<fused_primitive_desc> get_fused_eltwise_primitives();
static void for_eltwise(const program_node& conv_node, eltwise_mode mode,
std::function<void(const program_node&, const eltwise_node&, const fused_primitive_desc&)> func);
static add_fusing_type get_add_fusing_type(const program_node& node, const fused_primitive_desc& desc);
};
using add_fusing_type = onednn_add_fusing_helpers::add_fusing_type;
// Streams a human-readable name for an add_fusing_type value.
// The enum is taken by value (it is trivially cheap to copy) so that
// const objects and temporaries can be streamed too; the previous
// non-const reference parameter rejected both.
static inline std::ostream& operator<< (std::ostream& os, add_fusing_type t) {
    switch (t) {
    case add_fusing_type::sum: os << "sum"; break;
    case add_fusing_type::binary_per_tensor: os << "binary_per_tensor"; break;
    case add_fusing_type::binary_per_oc: os << "binary_per_oc"; break;
    default: os << "not_supported"; break;
    }
    return os;
}
// Base class for performing pattern match style optimizations.
// Uses CRTP idiom, implementing class should be passed as template parameter `Impl`,
// and overload match and optimize methods.

View File

@@ -56,6 +56,30 @@ enum class onednn_post_op_type : uint32_t {
optimized_sum
};
// Streams a human-readable name for an onednn_post_op_type value.
// The enum is taken by value (cheap to copy) so const objects and
// temporaries can be streamed; a non-const reference would reject both.
static inline std::ostream& operator<< (std::ostream& os, onednn_post_op_type t) {
    switch (t) {
    case onednn_post_op_type::eltwise_act: os << "eltwise_act"; break;
    case onednn_post_op_type::eltwise_clip: os << "eltwise_clip"; break;
    case onednn_post_op_type::eltwise_linear: os << "eltwise_linear"; break;
    case onednn_post_op_type::eltwise_round: os << "eltwise_round"; break;
    case onednn_post_op_type::binary_mul: os << "binary_mul"; break;
    case onednn_post_op_type::binary_add: os << "binary_add"; break;
    case onednn_post_op_type::binary_max: os << "binary_max"; break;
    case onednn_post_op_type::binary_min: os << "binary_min"; break;
    case onednn_post_op_type::binary_relu: os << "binary_relu"; break;
    case onednn_post_op_type::scale: os << "scale"; break;
    case onednn_post_op_type::sum: os << "sum"; break;
    case onednn_post_op_type::optimized: os << "optimized"; break;
    case onednn_post_op_type::optimized_eltwise_act: os << "optimized_eltwise_act"; break;
    case onednn_post_op_type::optimized_eltwise_clip: os << "optimized_eltwise_clip"; break;
    case onednn_post_op_type::optimized_eltwise_linear: os << "optimized_eltwise_linear"; break;
    case onednn_post_op_type::optimized_eltwise_round: os << "optimized_eltwise_round"; break;
    case onednn_post_op_type::optimized_sum: os << "optimized_sum"; break;
    default: os << "invalid";
    }
    return os;
}
struct fused_primitive_desc_onednn {
onednn_post_op_type op_type; // onednn post-operation type
size_t mem_offset; // index of a memory buffer for current post-operation

View File

@@ -198,12 +198,11 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
// Not to fuse reorder if this removal changes input format of its next node which has reuse in fused_op
if (next.get_preferred_impl_type() == impl_types::onednn) {
for (auto& fused_op : next.get_fused_primitives()) {
if (fused_op.node->is_type<eltwise>() && fused_op.deps.size() == 1) {
if (fused_op.node->is_type<eltwise>()) {
auto eltw_in_layout = next.get_dependency(fused_op.dep_start_idx).get_output_layout();
auto out_layout = next.get_output_layout();
if (program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(), eltw_in_layout) &&
program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in_layout, out_layout) &&
prev.get_output_layout().format != out_layout.format)
auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(next, fused_op);
if (add_type == add_fusing_type::sum && prev.get_output_layout().format != out_layout.format)
return false;
}
}
@@ -947,23 +946,6 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
bool use_onednn_impls = _optimization_attributes.use_onednn_impls && input_layout.data_type != data_types::f32;
bool i8_u8_input = input_layout.data_type == data_types::u8 || input_layout.data_type == data_types::i8;
if (use_onednn_impls && onednn_valid_post_ops) {
for (auto& fo : node.get_fused_primitives()) {
if (fo.node->is_type<eltwise>()) {
auto in_layout = node.get_dependency(fo.dep_start_idx).get_output_layout();
auto out_layout = node.get_output_layout();
auto in_dt = in_layout.data_type;
auto out_dt = out_layout.data_type;
if ((out_layout.count() == in_layout.count()) &&
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
program_helpers::needs_onednn_sum_post_op(fo.node->as<eltwise>(), in_layout)) {
onednn_valid_post_ops = false;
break;
}
}
}
}
if (use_onednn_impls && onednn_valid_post_ops) {
std::function<bool(const program_node&)> has_any_convolutions_below;
has_any_convolutions_below = [&](const program_node& node) -> bool {
@@ -1373,23 +1355,6 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
impl_candidate = impl_types::ocl;
}
// [WA] to avoid an onednn kernel issue of multiple sum post-ops
if (!node.get_fused_primitives().empty()) {
size_t sum_post_op_cnt = 0;
for (auto& fused_op : node.get_fused_primitives()) {
if (fused_op.node->is_type<eltwise>() && node.get_dependencies().size() > fused_op.dep_start_idx && fused_op.deps.size() == 1) {
auto& eltw_in = node.get_dependency(fused_op.dep_start_idx);
if (program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in.get_output_layout(), node.get_output_layout()) &&
program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(), eltw_in.get_output_layout())) {
if (sum_post_op_cnt > 0)
return impl_types::ocl;
sum_post_op_cnt += 1;
}
}
}
}
if (node.is_type<convolution>()) {
// oneDNN doesn't have good support for groups with fsv16 fmt
auto& conv = node.as<convolution>();
@@ -1418,29 +1383,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
impl_candidate = impl_types::ocl;
}
size_t eltw_dep = 0;
for (auto& fo : node.get_fused_primitives()) {
if (fo.node->is_type<eltwise>()) {
auto in_layout = node.get_dependency(fo.dep_start_idx).get_output_layout();
auto out_layout = node.get_output_layout();
auto in_dt = in_layout.data_type;
auto out_dt = out_layout.data_type;
if (program_helpers::needs_onednn_sum_post_op(fo.node->as<eltwise>(), in_layout)) {
if ((out_layout.count() == in_layout.count()) &&
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) {
impl_candidate = impl_types::ocl;
break;
}
if (in_layout.size == out_layout.size && in_layout.format == out_layout.format && in_layout.data_padding == out_layout.data_padding &&
data_type_traits::size_of(in_dt) == data_type_traits::size_of(out_dt)) {
if (eltw_dep > 0) {
impl_candidate = impl_types::ocl;
break;
}
eltw_dep = fo.dep_start_idx;
}
}
} else if (fo.node->is_type<activation>()) {
if (fo.node->is_type<activation>()) {
// Some activations aren't implemented in oneDNN
auto activation_prim = fo.node->as<activation>().get_primitive();
if (activation_prim->activation_function == activation_func::negative ||
@@ -1486,15 +1430,17 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
auto out_layout = node.get_output_layout();
auto in_dt = in_layout.data_type;
auto out_dt = out_layout.data_type;
if ((out_layout.count() == in_layout.count()) &&
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
program_helpers::needs_onednn_sum_post_op(fo.node->as<eltwise>(), in_layout)) {
// if it is not eltwise sum and input is full tensor
if ((out_layout.count() == in_layout.count()) && in_dt != out_dt
&& (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt))
&& onednn_add_fusing_helpers::is_full_tensor(in_layout)) {
impl_candidate = impl_types::ocl;
break;
}
if (fo.node->as<eltwise>().get_primitive()->mode == eltwise_mode::sum &&
program_helpers::needs_onednn_sum_post_op(fo.node->as<eltwise>(), in_layout)) {
// WA: onednn sum/binary_add post-op are not supported due to perf drop.
auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(node, fo);
if (add_type == add_fusing_type::sum || add_type == add_fusing_type::binary_per_tensor || add_type == add_fusing_type::binary_per_oc) {
impl_candidate = impl_types::ocl;
break;
}

View File

@@ -536,54 +536,22 @@ void network::allocate_primitives() {
for (auto const& node : _program->get_processing_order()) {
if (node->get_preferred_impl_type() == impl_types::onednn) {
bool can_reuse_eltwise_mem = false;
size_t eltw_dep = 0;
for (auto& fused_op : node->get_fused_primitives()) {
if (fused_op.node->is_type<eltwise>() && fused_op.deps.size() == 1) {
auto& eltw_in = node->get_dependency(fused_op.dep_start_idx);
auto eltw_in_layout = eltw_in.get_output_layout();
auto out_layout = node->get_output_layout();
if (!program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(), eltw_in_layout))
// If it is first sum, reuse the buffer
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
continue;
if (program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in_layout, out_layout)) {
if (eltw_dep > 0)
throw std::runtime_error("Unsupported multiple full size tensors.");
eltw_dep = fused_op.dep_start_idx;
can_reuse_eltwise_mem = true;
eltw_dep = fused_op.dep_start_idx;
auto& eltw_in = node->get_dependency(eltw_dep);
if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
auto& eltw_inst = _primitives.at(eltw_in.id());
auto& prim_inst = _primitives.at(node->id());
auto& eltw_mem = eltw_inst->output_memory();
auto new_mem = eltw_mem.get_engine()->reinterpret_buffer(eltw_mem, node->get_output_layout());
prim_inst->set_output_memory(new_mem);
}
if (!can_reuse_eltwise_mem) {
if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
auto& eltw_inst = _primitives.at(eltw_in.id());
auto& prim_inst = _primitives.at(node->id());
auto eltw_mem_type = eltw_inst->output_memory().get_allocation_type();
auto prim_mem_type = prim_inst->output_memory().get_allocation_type();
// Keep lockable memory type for `prim_inst` output if needed
if (eltw_mem_type != prim_mem_type && eltw_mem_type != allocation_type::cl_mem && eltw_mem_type != allocation_type::usm_host)
can_reuse_eltwise_mem = false;
}
}
if (program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(), eltw_in_layout) && !can_reuse_eltwise_mem) {
throw std::runtime_error("Buffer reuse is required for onednn sum post operation.");
}
}
}
if (can_reuse_eltwise_mem) {
auto& eltw_in = node->get_dependency(eltw_dep);
if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
auto& eltw_inst = _primitives.at(eltw_in.id());
auto& prim_inst = _primitives.at(node->id());
auto& eltw_mem = eltw_inst->output_memory();
auto new_mem = eltw_mem.get_engine()->reinterpret_buffer(eltw_mem, node->get_output_layout());
prim_inst->set_output_memory(new_mem);
}
}
}
@@ -698,8 +666,21 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
}
GPU_DEBUG_IF(debug_config->verbose >= 1) {
std::ostringstream in_addr;
// buffer_ptr() only supports usm_memory
for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) {
auto& in_mem = get_primitive(inst->id())->dep_memory(i);
in_addr << in_mem.buffer_ptr();
if (i < get_primitive(inst->id())->dependencies().size() - 1) {
in_addr << ", ";
}
}
auto& out_mem = get_primitive(inst->id())->output_memory();
GPU_DEBUG_COUT << "Execute " << inst->id() << ", memory type: "
<< inst->output_memory().get_allocation_type() << std::endl;
<< inst->output_memory().get_allocation_type() << ", in_usm("
<< in_addr.str() << "), out_usm("
<< out_mem.buffer_ptr() << ")" << std::endl;
}
// If a node has mutable input or it's an output, then the input/output buffers might be changed

View File

@@ -7,9 +7,11 @@
#include "program_helpers.h"
#include "intel_gpu/graph/program.hpp"
#include "data_inst.h"
#include "pooling_inst.h"
#include <algorithm>
#include <utility>
#include <vector>
#include <sstream>
namespace cldnn {
// helper function for merging the weights/biases buffers on cpu side for depthwise separable convolution optimization
@@ -181,28 +183,52 @@ std::pair<bool, bool> program_helpers::are_layouts_identical(layout const& l1, l
return {false, false};
}
// check if input and output layouts are identical to reuse memory in fused_ops of onednn
// Two layouts are "identical" for the purpose of reusing memory in a oneDNN
// sum post-op when their tensor sizes, formats, paddings, and element sizes
// all match. Element size (not exact data type) is what matters, because the
// buffer is reinterpreted rather than converted.
bool program_helpers::are_layouts_identical_for_onednn_sum_post_op(layout input_layout, layout output_layout) {
    return input_layout.size == output_layout.size
        && input_layout.format == output_layout.format
        && input_layout.data_padding == output_layout.data_padding
        && data_type_traits::size_of(input_layout.data_type) == data_type_traits::size_of(output_layout.data_type);
}
bool program_helpers::needs_onednn_sum_post_op(const eltwise_node& n, layout input_layout) {
auto output_layout = n.get_output_layout();
if (n.get_primitive()->mode == eltwise_mode::sum &&
(input_layout.size.spatial[0] > 1 || input_layout.size.spatial[1] > 1 || input_layout.size.batch[0] > 1)
&& output_layout.data_type == input_layout.data_type) {
// A layout is considered a "full tensor" when any spatial dimension
// (x, y, or — for 3-spatial-rank layouts — z) or the batch dimension
// is greater than one.
bool onednn_add_fusing_helpers::is_full_tensor(const layout& l) {
    const bool spatial_nontrivial = l.size.spatial[0] > 1
        || l.size.spatial[1] > 1
        || (l.get_spatial_rank() == 3 && l.size.spatial[2] > 1);
    return spatial_nontrivial || l.size.batch[0] > 1;
}
// Invokes func once for every fused eltwise primitive of node whose
// eltwise mode matches the requested mode. Non-eltwise fused primitives
// and eltwise primitives with a different mode are skipped.
void onednn_add_fusing_helpers::for_eltwise(
    const program_node& node, eltwise_mode mode,
    std::function<void(const program_node& p_node, const eltwise_node& e_node,
                       const fused_primitive_desc& desc)> func) {
    for (auto& fused_desc : node.get_fused_primitives()) {
        if (!fused_desc.node->is_type<eltwise>())
            continue;
        const auto& e_node = fused_desc.node->as<eltwise>();
        if (e_node.get_primitive()->mode == mode)
            func(node, e_node, fused_desc);
    }
}
// Classifies how a fused eltwise-sum on p_node should be mapped to a oneDNN
// post-op:
//  - sum:               the dependency's buffer can be reused in place
//  - binary_per_tensor: both are full tensors of equal size, but in-place
//                       reuse is unsafe (format/padding/element-size mismatch,
//                       multiple users, or a pooling node)
//  - binary_per_oc:     fallback — per-output-channel broadcast add
//  - not_supported:     desc is not an eltwise sum at all
// Fix vs. original: removed the unused locals (the eltwise node's own output
// layout was computed and never read).
add_fusing_type onednn_add_fusing_helpers::get_add_fusing_type(
    const program_node& p_node, const fused_primitive_desc& desc) {
    if (!desc.node->is_type<eltwise>() || desc.node->as<eltwise>().get_primitive()->mode != eltwise_mode::sum) {
        return add_fusing_type::not_supported;
    }
    auto& dep_node = p_node.get_dependency(desc.dep_start_idx);
    auto p_layout = p_node.get_output_layout();
    auto d_layout = dep_node.get_output_layout();
    if (is_full_tensor(p_layout) && is_full_tensor(d_layout)) {
        // In-place sum requires identical geometry, format, and padding,
        // matching element sizes (the buffer is reinterpreted, not converted),
        // a single user of the dependency (its buffer gets overwritten),
        // and a non-pooling node.
        if (data_type_traits::size_of(p_layout.data_type) == data_type_traits::size_of(d_layout.data_type)
            && p_layout.format == d_layout.format && p_layout.size == d_layout.size
            && p_layout.data_padding == d_layout.data_padding
            && dep_node.get_users().size() == 1
            && !p_node.is_type<pooling>()) {
            return add_fusing_type::sum;
        } else if (p_layout.size == d_layout.size) {
            return add_fusing_type::binary_per_tensor;
        }
    }
    return add_fusing_type::binary_per_oc;
}
} // namespace cldnn

View File

@@ -7,6 +7,7 @@
#include "primitive_inst.h"
#include "loop_inst.h"
#ifdef ENABLE_ONEDNN_FOR_GPU
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "convolution_inst.h"
#include "quantize_inst.h"
#include "reorder_inst.h"
@@ -345,6 +346,8 @@ bool program_node::has_out_scales(const std::shared_ptr<dnnl::primitive_attr>& a
dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const std::shared_ptr<dnnl::primitive_attr>& attr,
bool& optimization_is_completed) {
GPU_DEBUG_GET_INSTANCE(debug_config);
// Create new dnnl::post_ops object which will be filled inside the optimization process
dnnl::post_ops optimized_p_ops;
@@ -393,6 +396,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
float scale;
dnnl::memory::data_type data_type;
cur_p_ops.get_params_sum(idx, scale, data_type);
// Only conv supports data type specification in append_sum. Other primitives(deconv, fc) do not support it.
if (is_type<convolution>()) {
new_p_ops.append_sum(scale, data_type);
} else {
@@ -419,7 +423,8 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
// Check that post-op type is any optimized
auto type_is_any_optimized = [](onednn_post_op_type type) -> bool {
return type == onednn_post_op_type::optimized || type == onednn_post_op_type::optimized_sum ||
return type == onednn_post_op_type::optimized ||
type == onednn_post_op_type::optimized_sum ||
type == onednn_post_op_type::optimized_eltwise_act ||
type == onednn_post_op_type::optimized_eltwise_linear ||
type == onednn_post_op_type::optimized_eltwise_clip ||
@@ -462,20 +467,45 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
}
};
auto remove_optimized_prefix = [&](std::vector<fused_primitive_desc_onednn>& post_ops) {
// Check and update post-op map if we already optimized something
auto iter = post_ops.begin();
while (iter != post_ops.end()) {
if (type_is_optimized_sum(iter->op_type)) {
iter->op_type = onednn_post_op_type::sum;
++iter;
} else if (type_is_optimized_eltwise(iter->op_type)) {
iter->op_type = get_eltwise_type(iter->op_type);
++iter;
} else if (type_is_optimized(iter->op_type)) {
iter = post_ops.erase(iter);
} else {
++iter;
}
}
};
auto& cur_post_ops = get_fused_primitives_onednn();
size_t cur_post_op_idx = 1;
size_t prev_post_op_idx = 0;
bool optimization_done = false;
// Check and update post-op map if we already optimized something
for (size_t post_op_idx = 0; post_op_idx < cur_post_ops.size(); post_op_idx++) {
if (type_is_optimized_sum(cur_post_ops[post_op_idx].op_type))
cur_post_ops[post_op_idx].op_type = onednn_post_op_type::sum;
else if (type_is_optimized_eltwise(cur_post_ops[post_op_idx].op_type))
cur_post_ops[post_op_idx].op_type = get_eltwise_type(cur_post_ops[post_op_idx].op_type);
else if (type_is_optimized(cur_post_ops[post_op_idx].op_type))
cur_post_ops.erase(cur_post_ops.begin() + post_op_idx);
GPU_DEBUG_IF(debug_config->verbose >= 3) {
GPU_DEBUG_COUT << "================================================" << std::endl;
GPU_DEBUG_COUT << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
for (size_t i = 0; i < cur_post_ops.size(); i++)
GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
}
remove_optimized_prefix(cur_post_ops);
GPU_DEBUG_IF(debug_config->verbose >= 3) {
GPU_DEBUG_COUT << "remove optimized prefix ------------------------" << std::endl;
GPU_DEBUG_COUT << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
for (size_t i = 0; i < cur_post_ops.size(); i++)
GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
GPU_DEBUG_COUT << "----------------------------------->>>>>>>>>>>>>" << std::endl;
}
// Get post-ops size for current node
@@ -498,6 +528,9 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
auto cur_type = cur_post_ops[cur_post_op_idx].op_type;
auto prev_type = cur_post_ops[prev_post_op_idx].op_type;
GPU_DEBUG_IF(debug_config->verbose >= 3)
GPU_DEBUG_COUT << "before prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;
// Ignore optimized operations for "previous" operation in our operation pair
while (type_is_any_optimized(prev_type) && prev_post_op_idx < post_ops_size - 1) {
prev_post_op_idx++;
@@ -513,9 +546,18 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
cur_type = cur_post_ops[cur_post_op_idx].op_type;
}
GPU_DEBUG_IF(debug_config->verbose >= 3)
GPU_DEBUG_COUT << "after prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;
auto cur_idx = static_cast<int>(has_out_scales(attr) ? (cur_post_op_idx >= 1 ? cur_post_op_idx - 1 : 0) : cur_post_op_idx);
auto prev_idx = static_cast<int>(has_out_scales(attr) ? (prev_post_op_idx >= 1 ? prev_post_op_idx - 1 : 0) : prev_post_op_idx);
// if 2 indices are same, add the last post-op to dnnl::post_ops
if (prev_idx == post_ops_size - 1 && prev_idx == cur_idx && !type_is_any_optimized(prev_type)) {
add_post_op(prev_type, p_ops, optimized_p_ops, prev_idx);
break;
}
// If this is the last pair and it's optimized - add the last post-op and go out from the cycle
if (cur_post_op_idx == post_ops_size - 1 && (type_is_any_optimized(cur_type) || type_is_any_optimized(prev_type))) {
if (!type_is_any_optimized(prev_type)) {
@@ -542,6 +584,11 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
bool cur_ops_pair_is_optimized = false;
GPU_DEBUG_IF(debug_config->verbose >= 3) {
GPU_DEBUG_COUT << "prev_idx: " << prev_idx << " " << prev_type
<< ", cur_idx: " << cur_idx << " " << cur_type << std::endl;
}
if (can_try_optimize) {
if (eltw_and_eltw) {
dnnl::algorithm cur_alg, prev_alg;
@@ -701,6 +748,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
dnnl::post_ops eltw_p_op_prev, sum_p_op;
eltw_p_op_prev.append_eltwise(eltw_scale * next_alpha * next_scale, alg, alpha, beta);
// Only conv supports data type specification in append_sum. Other primitives(deconv, fc) do not support it.
if (is_type<convolution>()) {
sum_p_op.append_sum(sum_scale * next_alpha, data_type);
} else {
@@ -769,7 +817,18 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
}
}
// if optimization_is_completed is true, try to optimize again.
optimization_is_completed = !optimization_is_completed;
if (optimization_is_completed) {
remove_optimized_prefix(cur_post_ops);
}
GPU_DEBUG_IF(debug_config->verbose >= 3) {
GPU_DEBUG_COUT << ">>>>>>>>>>>>>-----------------------------------" << std::endl;
for (size_t i = 0; i < cur_post_ops.size(); i++)
GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
GPU_DEBUG_COUT << "------------------------------------------------" << std::endl;
}
add_onednn_fused_primitives(cur_post_ops);
@@ -805,6 +864,7 @@ void program_node::init_onednn_primitive_attributes() {
memory_offset++;
};
int32_t num_sum_post_ops = 0;
for (size_t idx = 0; idx < cldnn_post_ops.size(); idx++) {
auto node = cldnn_post_ops[idx].node;
@@ -834,13 +894,11 @@ void program_node::init_onednn_primitive_attributes() {
auto in = get_dependency(dep_idx).get_output_layout();
if (e_node.get_primitive()->mode == eltwise_mode::sum) {
if (program_helpers::needs_onednn_sum_post_op(e_node, in)) {
if (is_type<convolution>()) {
post_ops.append_sum(1.0f, onednn::convert_data_type(in.data_type));
} else {
post_ops.append_sum(1.0f);
}
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*this, cldnn_post_ops[idx]);
if (fusing_type == add_fusing_type::sum && num_sum_post_ops == 0) {
post_ops.append_sum(1.0f);
update_onednn_post_op_list(onednn_post_op_type::sum, dep_idx);
num_sum_post_ops++;
} else {
dnnl::memory::desc in_desc = onednn::layout_to_memory_desc(in);
post_ops.append_binary(dnnl::algorithm::binary_add, in_desc);

View File

@@ -107,6 +107,7 @@ struct gpu_usm : public lockable_gpu_mem, public memory {
void unlock(const stream& stream) override;
const cl::UsmMemory& get_buffer() const { return _buffer; }
cl::UsmMemory& get_buffer() { return _buffer; }
void* buffer_ptr() const override { return _buffer.get(); }
event::ptr fill(stream& stream, unsigned char pattern) override;
event::ptr fill(stream& stream) override;

View File

@@ -3410,68 +3410,130 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_input_range, ::test
convolution_test_params{ CASE_CONV_S8S8_15, 2, 3 },
}));
// Parameter pack for convolution + fused eltwise-sum tests.
// Describes the convolution geometry, the data types / formats of each
// participant (input, weights, eltwise operand, output), and the expected
// primitive counts with and without fusing.
struct convolution_eltw_sum_test_params {
    tensor in_shape;                        // convolution input shape
    tensor out_shape;                       // expected convolution output shape
    tensor kernel;                          // convolution kernel size
    ov::Strides stride;                     // convolution strides
    ov::CoordinateDiff pad;                 // convolution padding
    ov::Strides dilation;                   // convolution dilations
    uint32_t groups;                        // number of convolution groups
    data_types data_type;                   // input data type
    format input_format;                    // input layout format
    data_types weights_type;                // weights data type
    format weights_format;                  // weights layout format
    data_types eltw_type;                   // eltwise-sum operand data type
    format eltw_format;                     // eltwise-sum operand layout format
    data_types out_type;                    // output data type
    format out_format;                      // output layout format
    data_types default_type;                // data type for auxiliary data (e.g. per-channel constants)
    format default_format;                  // format for auxiliary data
    size_t expected_fused_primitives;       // primitive count expected after fusing
    size_t expected_not_fused_primitives;   // primitive count expected without fusing
};
// input:b_fs_yx_fsv32:u8 X weight:bfyx:i8 + eltwise_sum:b_fs_yx_fsv32:u8
// After optimization: eltwise_any + binary_add
// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_tanh+binary_add:u8:14:aBcd32b+eltwise_linear:1
// Common fixture for convolution + eltwise-sum fusing tests on the oneDNN path.
// Runs the fused and non-fused networks on the same random input, compares the
// outputs, and — on devices with immad support — verifies that the fused
// convolution was lowered to a oneDNN ("jit:ir") kernel.
class EltwiseSumFusingTestOneDNN : public BaseFusingTest<convolution_eltw_sum_test_params> {
public:
    void execute(convolution_eltw_sum_test_params& p) {
        // Quantized (u8) inputs get a bounded random range; float inputs use the default range.
        auto input_prim = p.data_type == data_types::u8 ? get_mem(get_input_layout(p), 0, 10) : get_mem(get_input_layout(p));
        network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
        network network_fused(this->engine, this->topology_fused, bo_fused);
        network_fused.set_input_data("input", input_prim);
        network_not_fused.set_input_data("input", input_prim);

        compare(network_not_fused, network_fused, p);

        auto pi_fused = network_fused.get_primitives_info();
        // const lambda parameter; renamed to avoid shadowing the outer `p`.
        auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), [](const primitive_info& info) {
            return info.original_id == "conv_prim";
        });
        if (info_fused != pi_fused.end() && engine.get_device_info().supports_immad) {
            std::cout << "kernel: " << info_fused->kernel_id << std::endl;
            EXPECT_TRUE(info_fused->kernel_id.find("jit:ir") != std::string::npos);
        }
    }

    // Input layout with symmetric spatial padding taken from the test params.
    layout get_input_layout(convolution_eltw_sum_test_params& p) {
        auto pad = p.pad;
        std::vector<int> pad_ = { 0, 0, static_cast<int>(pad[0]), static_cast<int>(pad[1]) };
        return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } };
    }

    // Per-output-channel layout (1 x out_features x 1 x 1), e.g. for bias/quantize data.
    layout get_per_channel_layout(convolution_eltw_sum_test_params& p) {
        return layout{ p.default_type, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } };
    }
};
// conv -> tanh -> quantize -> eltwise_sum with a full-tensor operand.
// Depending on the operand's type/format vs the conv output, oneDNN lowers the
// sum either to a `sum` post-op (matching layout/type) or to `binary_add`.
class onednn_binary_add_full_tensor : public EltwiseSumFusingTestOneDNN {};
TEST_P(onednn_binary_add_full_tensor, basic) {
    auto p = GetParam();
    create_topologies(
        input_layout("input", get_input_layout(p)),
        data("weights", get_mem(get_weights_layout(p))),
        data("bias", get_mem(get_bias_layout(p))),
        data("in_lo1", get_mem(get_single_element_layout(p), 0)),
        data("in_hi1", get_mem(get_single_element_layout(p), 100)),
        data("out_lo1", get_mem(get_single_element_layout(p), 0)),
        data("out_hi1", get_mem(get_single_element_layout(p), 100)),
        // Eltwise operand with its own type/format — drives sum vs binary_add selection.
        data("eltwise_data", get_mem(layout{ p.eltw_type, p.eltw_format, p.out_shape }, 0, 100)),
        convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation, false),
        activation("activation", "conv_prim", activation_func::hyperbolic_tan),
        quantize("quantize1", "activation", "in_lo1", "in_hi1", "out_lo1", "out_hi1", 256, p.out_type),
        eltwise("sum", { "quantize1", "eltwise_data" }, eltwise_mode::sum, p.out_type),
        reorder("reorder_bfyx", "sum", p.default_format, p.default_type)
    );

    tolerance = 1.f;
    execute(p);
}
// in_shape; out_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format;
#define CASE_CONV_U8S8_FT_BINARY_ADD_1 { 1, 32, 4, 4 }, { 1, 16, 4, 4 }, tensor{ 1, 1, 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, 1, data_types::u8, format::b_fs_yx_fsv32, data_types::i8, format::bfyx, data_types::f32, format::bfyx
// in_shape; out_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; eltw_type; eltw_format; out_type; out_format; default_type; default_format;
#define CASE_CONV_ELTW_SUM_BINARY_ADD_1 { 1, 32, 4, 4 }, { 1, 16, 4, 4 }, { 1, 1, 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, 1, data_types::u8, format::b_fs_yx_fsv32, data_types::i8, format::bfyx, data_types::u8, format::b_fs_yx_fsv32, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_CONV_ELTW_SUM_SUM_1 { 1, 32, 4, 4 }, { 1, 16, 4, 4 }, { 1, 1, 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, 1, data_types::u8, format::b_fs_yx_fsv32, data_types::i8, format::bfyx, data_types::u8, format::b_fs_yx_fsv32, data_types::u8, format::b_fs_yx_fsv32, data_types::f32, format::bfyx
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_add_full_tensor, ::testing::ValuesIn(std::vector<convolution_test_params>{
// cases with batch = 1
convolution_test_params{ CASE_CONV_U8S8_FT_BINARY_ADD_1, 2, 5 },
INSTANTIATE_TEST_SUITE_P(eltwise_sum_fusings_gpu, onednn_binary_add_full_tensor, ::testing::ValuesIn(std::vector<convolution_eltw_sum_test_params>{
convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_BINARY_ADD_1, 2, 5 },
convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_SUM_1, 2, 5 },
}));
// input:b_fs_yx_fsv16:f16 X weight:bfyx:f16 + eltwise_sum:b_fs_yx_fsv16:f16
// After optimization: eltwise_any + sum
// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_tanh+sum:1:0:f16
class post_ops_optimizations_onednn_sum_full_tensor : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_sum_full_tensor, basic) {
class onednn_multiple_binary_add_full_tensor : public EltwiseSumFusingTestOneDNN {};
TEST_P(onednn_multiple_binary_add_full_tensor, basic) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
data("in_lo1", get_mem(get_single_element_layout(p), 0)),
data("in_hi1", get_mem(get_single_element_layout(p), 100)),
data("out_lo1", get_mem(get_single_element_layout(p), 0)),
data("out_hi1", get_mem(get_single_element_layout(p), 100)),
data("eltwise_data", get_mem(layout{ p.eltw_type, p.eltw_format, p.out_shape }, 0, 100)),
data("eltwise_data1", get_mem(layout{ p.eltw_type, p.eltw_format, p.out_shape }, 0, 100)),
data("eltwise_data2", get_mem(layout{ p.eltw_type, format::bfyx, tensor{ 1, p.out_shape.feature[0], 1, 1 } }, 0, 100)),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation, false),
activation("activation", "conv_prim", activation_func::hyperbolic_tan),
data("eltwise_data", get_mem(get_output_layout(p), 0, 255)),
eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum),
reorder("reorder_bfyx", "sum", p.default_format, data_types::f32)
quantize("quantize1", "activation", "in_lo1", "in_hi1", "out_lo1", "out_hi1", 256, p.out_type),
eltwise("sum", { "quantize1", "eltwise_data" }, eltwise_mode::sum, p.out_type), // eltwise sum with full tensor
eltwise("sum1", { "sum", "eltwise_data1" }, eltwise_mode::sum, p.out_type), // eltwise sum with full tensor
eltwise("sum2", { "sum1", "eltwise_data2" }, eltwise_mode::sum, p.out_type), // eltwise sum with broadcasting
reorder("reorder_bfyx", "sum2", p.default_format, p.default_type)
);
tolerance = 1.f;
execute(p);
}
#define CASE_CONV_F16F16_FT_ELTW_SUM_1 { 1, 32, 4, 4 }, { 1, 16, 4, 4 }, tensor{ 1, 1, 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, 1, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::bfyx, data_types::f32, format::bfyx
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_sum_full_tensor, ::testing::ValuesIn(std::vector<convolution_test_params>{
// cases with batch = 1
convolution_test_params{ CASE_CONV_F16F16_FT_ELTW_SUM_1, 2, 4 },
INSTANTIATE_TEST_SUITE_P(multiple_eltwise_sum_fusings_gpu, onednn_multiple_binary_add_full_tensor, ::testing::ValuesIn(std::vector<convolution_eltw_sum_test_params>{
convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_BINARY_ADD_1, 2, 7 },
convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_SUM_1, 2, 7 },
}));
#endif // ENABLE_ONEDNN_FOR_GPU

View File

@@ -227,4 +227,10 @@ public:
topology_fused.add(args...);
topology_non_fused.add(args...);
}
// Adds the same primitives to both the fused and the non-fused topology,
// keeping the two networks structurally comparable for the fusing tests.
template <class... Args>
void add_topologies(Args const&... args) {
    topology_fused.add(args...);
    topology_non_fused.add(args...);
}
};