[GPU] Code refactoring to choose between binary_add and sum (#10724)
+ Fix colorization-sig accuracy issue when using oneDNN.
+ Fix memory crash in the case of reuse_eltwise_sum_post in oneDNN together with memory_pool.
+ Print each node's in/out gpu_usm_mem addresses when OV_GPU_Verbose >= 1.
+ Check the size of the z spatial axis when checking for a full tensor.
+ Remove program_helpers' functions.
Co-authored-by: hyunback <hyunback.kim@intel.com>
This commit is contained in:
@@ -34,6 +34,8 @@ struct memory {
|
||||
virtual void unlock(const stream& stream) = 0;
|
||||
virtual event::ptr fill(stream& stream, unsigned char pattern) = 0;
|
||||
virtual event::ptr fill(stream& stream) = 0;
|
||||
// only supports gpu_usm
|
||||
virtual void* buffer_ptr() const { return nullptr; }
|
||||
|
||||
size_t size() const { return _bytes_count; }
|
||||
size_t count() const { return _layout.count(); }
|
||||
|
||||
@@ -36,39 +36,26 @@ void basic_memory_dependencies::run(program& p) {
|
||||
add_memory_dependency(it, node);
|
||||
}
|
||||
|
||||
if (node->is_type<convolution>() && node->get_preferred_impl_type() == impl_types::onednn) {
|
||||
auto& conv = node->as<convolution>();
|
||||
bool can_reuse_eltwise_mem = false;
|
||||
if (node->get_preferred_impl_type() == impl_types::onednn
|
||||
&& (node->is_type<convolution>() || node->is_type<deconvolution>())) {
|
||||
size_t eltw_dep = 0;
|
||||
|
||||
for (auto& fused_op : conv.get_fused_primitives()) {
|
||||
for (auto& fused_op : node->get_fused_primitives()) {
|
||||
if (fused_op.node->is_type<eltwise>() && fused_op.deps.size() == 1) {
|
||||
auto eltw_in_layout = conv.get_dependency(fused_op.dep_start_idx).get_output_layout();
|
||||
auto conv_out_layout = node->get_output_layout();
|
||||
if (eltw_dep > 0) {
|
||||
can_reuse_eltwise_mem = false;
|
||||
break;
|
||||
}
|
||||
// If it is first sum, reuse the buffer
|
||||
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
|
||||
if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
|
||||
continue;
|
||||
|
||||
if (eltw_in_layout.size == conv_out_layout.size &&
|
||||
eltw_in_layout.format == conv_out_layout.format &&
|
||||
eltw_in_layout.data_padding == conv_out_layout.data_padding &&
|
||||
data_type_traits::size_of(eltw_in_layout.data_type) == data_type_traits::size_of(conv_out_layout.data_type)) {
|
||||
eltw_dep = fused_op.dep_start_idx;
|
||||
can_reuse_eltwise_mem = true;
|
||||
eltw_dep = fused_op.dep_start_idx;
|
||||
auto& eltw_node = node->get_dependency(eltw_dep);
|
||||
eltw_node.can_share_buffer(false);
|
||||
node->can_share_buffer(false);
|
||||
for (auto& user : node->get_users()) {
|
||||
add_memory_dependency(user, &eltw_node);
|
||||
add_memory_dependency(user, node);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (can_reuse_eltwise_mem) {
|
||||
auto& eltw_node = conv.get_dependency(eltw_dep);
|
||||
eltw_node.can_share_buffer(false);
|
||||
conv.can_share_buffer(false);
|
||||
for (auto& user : conv.get_users()) {
|
||||
add_memory_dependency(user, &eltw_node);
|
||||
add_memory_dependency(user, &conv);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Note we iterate over processing order, it means if primitive has processing num greater than any of outputs,
|
||||
|
||||
@@ -93,16 +93,11 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
|
||||
for (auto& input : node.get_dependencies()) {
|
||||
if (input->get_preferred_impl_type() == impl_types::onednn) {
|
||||
for (auto& fused_op : input->get_fused_primitives()) {
|
||||
if (fused_op.node->is_type<eltwise>() && fused_op.deps.size() == 1) {
|
||||
auto& eltw_in = input->get_dependency(fused_op.dep_start_idx);
|
||||
auto eltw_in_layout = eltw_in.get_output_layout();
|
||||
auto out_layout = input->get_output_layout();
|
||||
|
||||
if (!program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(), eltw_in_layout))
|
||||
continue;
|
||||
if (program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in_layout, out_layout))
|
||||
return false;
|
||||
}
|
||||
auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(*input, fused_op);
|
||||
if (add_type == add_fusing_type::sum)
|
||||
return false;
|
||||
else
|
||||
continue;
|
||||
}
|
||||
is_onednn_impl = true;
|
||||
}
|
||||
|
||||
@@ -649,44 +649,38 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
|
||||
// When the conv node is of onednn impl type and eltwise sum with full tensor is fused,
|
||||
// changes the input format of eltwise sum post-op to use binary add.
|
||||
if (conv_node.get_preferred_impl_type() == impl_types::onednn) {
|
||||
std::vector<size_t> eltw_sum_dep_indices;
|
||||
for (size_t i = 1; i < conv_node.get_dependencies().size(); i++) {
|
||||
auto& dep = conv_node.get_dependency(i);
|
||||
for (auto& fused_op : conv_node.get_fused_primitives()) {
|
||||
if (fused_op.node->is_type<eltwise>()
|
||||
&& fused_op.node->as<eltwise>().get_primitive()->mode == eltwise_mode::sum
|
||||
&& !program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(),
|
||||
conv_node.get_dependency(fused_op.dep_start_idx).get_output_layout())
|
||||
&& conv_node.get_dependency(fused_op.dep_start_idx).get_users().size() == 1
|
||||
&& conv_node.get_dependency(fused_op.dep_start_idx).id() == dep.id()) {
|
||||
eltw_sum_dep_indices.push_back(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
onednn_add_fusing_helpers::for_eltwise(conv_node, eltwise_mode::sum,
|
||||
[&](const program_node& p_node, const eltwise_node& e_node, const fused_primitive_desc& desc) {
|
||||
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(p_node, desc);
|
||||
if (fusing_type == add_fusing_type::binary_per_tensor) {
|
||||
auto& dep_node = p_node.get_dependency(desc.dep_start_idx);
|
||||
auto d_layout = dep_node.get_output_layout();
|
||||
auto d_format = d_layout.format;
|
||||
auto expected_format = format::any;
|
||||
|
||||
auto conv_layout = conv_node.get_output_layout();
|
||||
for (auto& dep_id : eltw_sum_dep_indices) {
|
||||
auto& prev_node = conv_node.get_dependency(dep_id);
|
||||
auto old_layout = prev_node.get_output_layout();
|
||||
auto expected_format = format::any;
|
||||
if ((conv_layout.data_type == data_types::f16 || conv_layout.data_type == data_types::f32)
|
||||
&& data_type_traits::is_i8_u8(old_layout.data_type)) {
|
||||
if (conv_layout.format == format::b_fs_yx_fsv16)
|
||||
expected_format = format::b_fs_yx_fsv32;
|
||||
if (conv_layout.format == format::bs_fs_yx_bsv32_fsv16)
|
||||
expected_format = format::bs_fs_yx_bsv32_fsv32;
|
||||
}
|
||||
if (data_type_traits::is_i8_u8(d_layout.data_type)) {
|
||||
if (d_format == format::b_fs_yx_fsv16)
|
||||
expected_format = format::b_fs_yx_fsv32;
|
||||
else if (d_format == format::bs_fs_yx_bsv32_fsv16)
|
||||
expected_format = format::bs_fs_yx_bsv32_fsv32;
|
||||
} else if (data_type_traits::is_floating_point(d_layout.data_type)) {
|
||||
if (d_format == format::b_fs_yx_fsv32)
|
||||
expected_format = format::b_fs_yx_fsv16;
|
||||
else if (d_format == format::bs_fs_yx_bsv32_fsv32)
|
||||
expected_format = format::bs_fs_yx_bsv32_fsv16;
|
||||
}
|
||||
|
||||
if (expected_format != format::any && old_layout.format != expected_format) {
|
||||
auto new_layout = old_layout;
|
||||
new_layout.format = expected_format;
|
||||
auto new_input = rf.get_reorder(prev_node.id(), old_layout, new_layout);
|
||||
if (new_input.first) {
|
||||
p.add_intermediate(new_input.first, conv_node, dep_id, !new_input.second);
|
||||
if (expected_format != format::any && d_layout.format != expected_format) {
|
||||
auto new_layout = d_layout;
|
||||
new_layout.format = expected_format;
|
||||
auto new_input = rf.get_reorder(dep_node.id(), d_layout, new_layout);
|
||||
if (new_input.first) {
|
||||
p.add_intermediate(new_input.first, conv_node, desc.dep_start_idx, !new_input.second);
|
||||
}
|
||||
conv_node.get_dependency(desc.dep_start_idx).set_output_layout(new_layout, false);
|
||||
}
|
||||
}
|
||||
conv_node.get_dependency(dep_id).set_output_layout(new_layout, false);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -11,10 +11,12 @@
|
||||
#include "intel_gpu/graph/program.hpp"
|
||||
#include "data_inst.h"
|
||||
#include "eltwise_inst.h"
|
||||
#include "convolution_inst.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <iostream>
|
||||
|
||||
namespace cldnn {
|
||||
struct program_helpers {
|
||||
@@ -125,12 +127,35 @@ struct program_helpers {
|
||||
}
|
||||
}
|
||||
static layout get_weights_layout(typed_program_node<cldnn::data>& data_node, int32_t split);
|
||||
|
||||
static bool are_layouts_identical_for_onednn_sum_post_op(layout input_layout, layout output_layout);
|
||||
|
||||
static bool needs_onednn_sum_post_op(const eltwise_node& n, layout input_layout);
|
||||
};
|
||||
|
||||
struct onednn_add_fusing_helpers {
|
||||
enum class add_fusing_type {
|
||||
sum,
|
||||
binary_per_tensor,
|
||||
binary_per_oc,
|
||||
not_supported,
|
||||
};
|
||||
|
||||
static bool is_full_tensor(const layout& layout);
|
||||
static std::vector<fused_primitive_desc> get_fused_eltwise_primitives();
|
||||
static void for_eltwise(const program_node& conv_node, eltwise_mode mode,
|
||||
std::function<void(const program_node&, const eltwise_node&, const fused_primitive_desc&)> func);
|
||||
static add_fusing_type get_add_fusing_type(const program_node& node, const fused_primitive_desc& desc);
|
||||
};
|
||||
|
||||
using add_fusing_type = onednn_add_fusing_helpers::add_fusing_type;
|
||||
|
||||
static inline std::ostream& operator<< (std::ostream& os, add_fusing_type& t) {
|
||||
switch (t) {
|
||||
case add_fusing_type::sum: os << "sum"; break;
|
||||
case add_fusing_type::binary_per_tensor: os << "binary_per_tensor"; break;
|
||||
case add_fusing_type::binary_per_oc: os << "binary_per_oc"; break;
|
||||
default: os << "not_supported"; break;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
// Base class for performing pattern match style optimizations.
|
||||
// Uses CRTP idiom, implementing class should be passed as template parameter `Impl`,
|
||||
// and overload match and optimize methods.
|
||||
|
||||
@@ -56,6 +56,30 @@ enum class onednn_post_op_type : uint32_t {
|
||||
optimized_sum
|
||||
};
|
||||
|
||||
static inline std::ostream& operator<< (std::ostream& os, onednn_post_op_type& t) {
|
||||
switch (t) {
|
||||
case onednn_post_op_type::eltwise_act: os << "eltwise_act"; break;
|
||||
case onednn_post_op_type::eltwise_clip: os << "eltwise_clip"; break;
|
||||
case onednn_post_op_type::eltwise_linear: os << "eltwise_linear"; break;
|
||||
case onednn_post_op_type::eltwise_round: os << "eltwise_round"; break;
|
||||
case onednn_post_op_type::binary_mul: os << "binary_mul"; break;
|
||||
case onednn_post_op_type::binary_add: os << "binary_add"; break;
|
||||
case onednn_post_op_type::binary_max: os << "binary_max"; break;
|
||||
case onednn_post_op_type::binary_min: os << "binary_min"; break;
|
||||
case onednn_post_op_type::binary_relu: os << "binary_relu"; break;
|
||||
case onednn_post_op_type::scale: os << "scale"; break;
|
||||
case onednn_post_op_type::sum: os << "sum"; break;
|
||||
case onednn_post_op_type::optimized: os << "optimized"; break;
|
||||
case onednn_post_op_type::optimized_eltwise_act: os << "optimized_eltwise_act"; break;
|
||||
case onednn_post_op_type::optimized_eltwise_clip: os << "optimized_eltwise_clip"; break;
|
||||
case onednn_post_op_type::optimized_eltwise_linear: os << "optimized_eltwise_linear"; break;
|
||||
case onednn_post_op_type::optimized_eltwise_round: os << "optimized_eltwise_round"; break;
|
||||
case onednn_post_op_type::optimized_sum: os << "optimized_sum"; break;
|
||||
default: os << "invalid";
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
struct fused_primitive_desc_onednn {
|
||||
onednn_post_op_type op_type; // onednn post-operation type
|
||||
size_t mem_offset; // index of a memory buffer for current post-operation
|
||||
|
||||
@@ -198,12 +198,11 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
|
||||
// Not to fuse reorder if this removal changes input format of its next node which has reuse in fused_op
|
||||
if (next.get_preferred_impl_type() == impl_types::onednn) {
|
||||
for (auto& fused_op : next.get_fused_primitives()) {
|
||||
if (fused_op.node->is_type<eltwise>() && fused_op.deps.size() == 1) {
|
||||
if (fused_op.node->is_type<eltwise>()) {
|
||||
auto eltw_in_layout = next.get_dependency(fused_op.dep_start_idx).get_output_layout();
|
||||
auto out_layout = next.get_output_layout();
|
||||
if (program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(), eltw_in_layout) &&
|
||||
program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in_layout, out_layout) &&
|
||||
prev.get_output_layout().format != out_layout.format)
|
||||
auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(next, fused_op);
|
||||
if (add_type == add_fusing_type::sum && prev.get_output_layout().format != out_layout.format)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -947,23 +946,6 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
|
||||
bool use_onednn_impls = _optimization_attributes.use_onednn_impls && input_layout.data_type != data_types::f32;
|
||||
bool i8_u8_input = input_layout.data_type == data_types::u8 || input_layout.data_type == data_types::i8;
|
||||
|
||||
if (use_onednn_impls && onednn_valid_post_ops) {
|
||||
for (auto& fo : node.get_fused_primitives()) {
|
||||
if (fo.node->is_type<eltwise>()) {
|
||||
auto in_layout = node.get_dependency(fo.dep_start_idx).get_output_layout();
|
||||
auto out_layout = node.get_output_layout();
|
||||
auto in_dt = in_layout.data_type;
|
||||
auto out_dt = out_layout.data_type;
|
||||
if ((out_layout.count() == in_layout.count()) &&
|
||||
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
|
||||
program_helpers::needs_onednn_sum_post_op(fo.node->as<eltwise>(), in_layout)) {
|
||||
onednn_valid_post_ops = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (use_onednn_impls && onednn_valid_post_ops) {
|
||||
std::function<bool(const program_node&)> has_any_convolutions_below;
|
||||
has_any_convolutions_below = [&](const program_node& node) -> bool {
|
||||
@@ -1373,23 +1355,6 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
|
||||
impl_candidate = impl_types::ocl;
|
||||
}
|
||||
|
||||
// [WA] to avoid an onednn kernel issue of multiple sum post-ops
|
||||
if (!node.get_fused_primitives().empty()) {
|
||||
size_t sum_post_op_cnt = 0;
|
||||
for (auto& fused_op : node.get_fused_primitives()) {
|
||||
if (fused_op.node->is_type<eltwise>() && node.get_dependencies().size() > fused_op.dep_start_idx && fused_op.deps.size() == 1) {
|
||||
auto& eltw_in = node.get_dependency(fused_op.dep_start_idx);
|
||||
if (program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in.get_output_layout(), node.get_output_layout()) &&
|
||||
program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(), eltw_in.get_output_layout())) {
|
||||
if (sum_post_op_cnt > 0)
|
||||
return impl_types::ocl;
|
||||
|
||||
sum_post_op_cnt += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (node.is_type<convolution>()) {
|
||||
// oneDNN doesn't have good support for groups with fsv16 fmt
|
||||
auto& conv = node.as<convolution>();
|
||||
@@ -1418,29 +1383,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
|
||||
impl_candidate = impl_types::ocl;
|
||||
}
|
||||
|
||||
size_t eltw_dep = 0;
|
||||
for (auto& fo : node.get_fused_primitives()) {
|
||||
if (fo.node->is_type<eltwise>()) {
|
||||
auto in_layout = node.get_dependency(fo.dep_start_idx).get_output_layout();
|
||||
auto out_layout = node.get_output_layout();
|
||||
auto in_dt = in_layout.data_type;
|
||||
auto out_dt = out_layout.data_type;
|
||||
if (program_helpers::needs_onednn_sum_post_op(fo.node->as<eltwise>(), in_layout)) {
|
||||
if ((out_layout.count() == in_layout.count()) &&
|
||||
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) {
|
||||
impl_candidate = impl_types::ocl;
|
||||
break;
|
||||
}
|
||||
if (in_layout.size == out_layout.size && in_layout.format == out_layout.format && in_layout.data_padding == out_layout.data_padding &&
|
||||
data_type_traits::size_of(in_dt) == data_type_traits::size_of(out_dt)) {
|
||||
if (eltw_dep > 0) {
|
||||
impl_candidate = impl_types::ocl;
|
||||
break;
|
||||
}
|
||||
eltw_dep = fo.dep_start_idx;
|
||||
}
|
||||
}
|
||||
} else if (fo.node->is_type<activation>()) {
|
||||
if (fo.node->is_type<activation>()) {
|
||||
// Some activations aren't implemented in oneDNN
|
||||
auto activation_prim = fo.node->as<activation>().get_primitive();
|
||||
if (activation_prim->activation_function == activation_func::negative ||
|
||||
@@ -1486,15 +1430,17 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
|
||||
auto out_layout = node.get_output_layout();
|
||||
auto in_dt = in_layout.data_type;
|
||||
auto out_dt = out_layout.data_type;
|
||||
if ((out_layout.count() == in_layout.count()) &&
|
||||
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
|
||||
program_helpers::needs_onednn_sum_post_op(fo.node->as<eltwise>(), in_layout)) {
|
||||
// if it is not eltwise sum and input is full tensor
|
||||
if ((out_layout.count() == in_layout.count()) && in_dt != out_dt
|
||||
&& (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt))
|
||||
&& onednn_add_fusing_helpers::is_full_tensor(in_layout)) {
|
||||
impl_candidate = impl_types::ocl;
|
||||
break;
|
||||
}
|
||||
|
||||
if (fo.node->as<eltwise>().get_primitive()->mode == eltwise_mode::sum &&
|
||||
program_helpers::needs_onednn_sum_post_op(fo.node->as<eltwise>(), in_layout)) {
|
||||
// WA: onednn sum/binary_add post-op are not supported due to perf drop.
|
||||
auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(node, fo);
|
||||
if (add_type == add_fusing_type::sum || add_type == add_fusing_type::binary_per_tensor || add_type == add_fusing_type::binary_per_oc) {
|
||||
impl_candidate = impl_types::ocl;
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -536,54 +536,22 @@ void network::allocate_primitives() {
|
||||
|
||||
for (auto const& node : _program->get_processing_order()) {
|
||||
if (node->get_preferred_impl_type() == impl_types::onednn) {
|
||||
bool can_reuse_eltwise_mem = false;
|
||||
size_t eltw_dep = 0;
|
||||
|
||||
for (auto& fused_op : node->get_fused_primitives()) {
|
||||
if (fused_op.node->is_type<eltwise>() && fused_op.deps.size() == 1) {
|
||||
auto& eltw_in = node->get_dependency(fused_op.dep_start_idx);
|
||||
auto eltw_in_layout = eltw_in.get_output_layout();
|
||||
auto out_layout = node->get_output_layout();
|
||||
|
||||
if (!program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(), eltw_in_layout))
|
||||
// If it is first sum, reuse the buffer
|
||||
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
|
||||
if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
|
||||
continue;
|
||||
|
||||
if (program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in_layout, out_layout)) {
|
||||
if (eltw_dep > 0)
|
||||
throw std::runtime_error("Unsupported multiple full size tensors.");
|
||||
|
||||
eltw_dep = fused_op.dep_start_idx;
|
||||
can_reuse_eltwise_mem = true;
|
||||
eltw_dep = fused_op.dep_start_idx;
|
||||
auto& eltw_in = node->get_dependency(eltw_dep);
|
||||
if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
|
||||
auto& eltw_inst = _primitives.at(eltw_in.id());
|
||||
auto& prim_inst = _primitives.at(node->id());
|
||||
auto& eltw_mem = eltw_inst->output_memory();
|
||||
auto new_mem = eltw_mem.get_engine()->reinterpret_buffer(eltw_mem, node->get_output_layout());
|
||||
prim_inst->set_output_memory(new_mem);
|
||||
}
|
||||
|
||||
if (!can_reuse_eltwise_mem) {
|
||||
if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
|
||||
auto& eltw_inst = _primitives.at(eltw_in.id());
|
||||
auto& prim_inst = _primitives.at(node->id());
|
||||
auto eltw_mem_type = eltw_inst->output_memory().get_allocation_type();
|
||||
auto prim_mem_type = prim_inst->output_memory().get_allocation_type();
|
||||
|
||||
// Keep lockable memory type for `prim_inst` output if needed
|
||||
if (eltw_mem_type != prim_mem_type && eltw_mem_type != allocation_type::cl_mem && eltw_mem_type != allocation_type::usm_host)
|
||||
can_reuse_eltwise_mem = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(), eltw_in_layout) && !can_reuse_eltwise_mem) {
|
||||
throw std::runtime_error("Buffer reuse is required for onednn sum post operation.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (can_reuse_eltwise_mem) {
|
||||
auto& eltw_in = node->get_dependency(eltw_dep);
|
||||
if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
|
||||
auto& eltw_inst = _primitives.at(eltw_in.id());
|
||||
auto& prim_inst = _primitives.at(node->id());
|
||||
auto& eltw_mem = eltw_inst->output_memory();
|
||||
auto new_mem = eltw_mem.get_engine()->reinterpret_buffer(eltw_mem, node->get_output_layout());
|
||||
|
||||
prim_inst->set_output_memory(new_mem);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -698,8 +666,21 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
|
||||
}
|
||||
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 1) {
|
||||
std::ostringstream in_addr;
|
||||
// buffer_ptr() only supports usm_memory
|
||||
for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) {
|
||||
auto& in_mem = get_primitive(inst->id())->dep_memory(i);
|
||||
in_addr << in_mem.buffer_ptr();
|
||||
if (i < get_primitive(inst->id())->dependencies().size() - 1) {
|
||||
in_addr << ", ";
|
||||
}
|
||||
}
|
||||
auto& out_mem = get_primitive(inst->id())->output_memory();
|
||||
|
||||
GPU_DEBUG_COUT << "Execute " << inst->id() << ", memory type: "
|
||||
<< inst->output_memory().get_allocation_type() << std::endl;
|
||||
<< inst->output_memory().get_allocation_type() << ", in_usm("
|
||||
<< in_addr.str() << "), out_usm("
|
||||
<< out_mem.buffer_ptr() << ")" << std::endl;
|
||||
}
|
||||
|
||||
// If a node has mutable input or it's an output, then the input/output buffers might be changed
|
||||
|
||||
@@ -7,9 +7,11 @@
|
||||
#include "program_helpers.h"
|
||||
#include "intel_gpu/graph/program.hpp"
|
||||
#include "data_inst.h"
|
||||
#include "pooling_inst.h"
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
|
||||
namespace cldnn {
|
||||
// helper function for merging the weights/biases buffers on cpu side for depthwise separable convolution optimization
|
||||
@@ -181,28 +183,52 @@ std::pair<bool, bool> program_helpers::are_layouts_identical(layout const& l1, l
|
||||
return {false, false};
|
||||
}
|
||||
|
||||
// check if input and output layouts are identical to reuse memory in fused_ops of onednn
|
||||
bool program_helpers::are_layouts_identical_for_onednn_sum_post_op(layout input_layout, layout output_layout) {
|
||||
if (input_layout.size == output_layout.size && input_layout.format == output_layout.format &&
|
||||
input_layout.data_padding == output_layout.data_padding &&
|
||||
data_type_traits::size_of(input_layout.data_type) == data_type_traits::size_of(output_layout.data_type))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool program_helpers::needs_onednn_sum_post_op(const eltwise_node& n, layout input_layout) {
|
||||
auto output_layout = n.get_output_layout();
|
||||
if (n.get_primitive()->mode == eltwise_mode::sum &&
|
||||
(input_layout.size.spatial[0] > 1 || input_layout.size.spatial[1] > 1 || input_layout.size.batch[0] > 1)
|
||||
&& output_layout.data_type == input_layout.data_type) {
|
||||
bool onednn_add_fusing_helpers::is_full_tensor(const layout& l) {
|
||||
if (l.size.spatial[0] > 1 || l.size.spatial[1] > 1 || (l.get_spatial_rank() == 3 && l.size.spatial[2] > 1)
|
||||
|| l.size.batch[0] > 1) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void onednn_add_fusing_helpers::for_eltwise(
|
||||
const program_node& node, eltwise_mode mode,
|
||||
std::function<void(const program_node& p_node, const eltwise_node& e_node,
|
||||
const fused_primitive_desc& desc)> func) {
|
||||
for (auto& fo : node.get_fused_primitives()) {
|
||||
if (fo.node->is_type<eltwise>() && fo.node->as<eltwise>().get_primitive()->mode == mode) {
|
||||
func(node, fo.node->as<eltwise>(), fo);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Classifies the fused primitive `desc` of `p_node`:
//   not_supported     - `desc` is not an eltwise primitive in sum mode;
//   sum               - output and dependency are both full tensors with equal element byte
//                       width, format, size and padding, the dependency has exactly one user
//                       and `p_node` is not a pooling node;
//   binary_per_tensor - both are full tensors of equal size but the `sum` conditions fail;
//   binary_per_oc     - every other eltwise sum.
add_fusing_type onednn_add_fusing_helpers::get_add_fusing_type(
        const program_node& p_node, const fused_primitive_desc& desc) {
    const bool is_eltwise_sum = desc.node->is_type<eltwise>()
                                && desc.node->as<eltwise>().get_primitive()->mode == eltwise_mode::sum;
    if (!is_eltwise_sum)
        return add_fusing_type::not_supported;

    auto& fused_eltwise = desc.node->as<eltwise>();
    auto& dependency = p_node.get_dependency(desc.dep_start_idx);

    auto out_layout = p_node.get_output_layout();
    // NOTE(review): the eltwise layout is queried but never read afterwards; the call is kept
    // in case get_output_layout() has side effects - confirm before removing.
    auto eltwise_layout = fused_eltwise.get_output_layout();
    auto dep_layout = dependency.get_output_layout();

    // Anything that is not "full tensor + full tensor" is handled per output channel.
    if (!is_full_tensor(out_layout) || !is_full_tensor(dep_layout))
        return add_fusing_type::binary_per_oc;

    const bool usable_as_sum =
        data_type_traits::size_of(out_layout.data_type) == data_type_traits::size_of(dep_layout.data_type)
        && out_layout.format == dep_layout.format
        && out_layout.size == dep_layout.size
        && out_layout.data_padding == dep_layout.data_padding
        && dependency.get_users().size() == 1
        && !p_node.is_type<pooling>();
    if (usable_as_sum)
        return add_fusing_type::sum;

    if (out_layout.size == dep_layout.size)
        return add_fusing_type::binary_per_tensor;

    return add_fusing_type::binary_per_oc;
}
|
||||
|
||||
|
||||
} // namespace cldnn
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include "primitive_inst.h"
|
||||
#include "loop_inst.h"
|
||||
#ifdef ENABLE_ONEDNN_FOR_GPU
|
||||
#include "intel_gpu/runtime/debug_configuration.hpp"
|
||||
#include "convolution_inst.h"
|
||||
#include "quantize_inst.h"
|
||||
#include "reorder_inst.h"
|
||||
@@ -345,6 +346,8 @@ bool program_node::has_out_scales(const std::shared_ptr<dnnl::primitive_attr>& a
|
||||
|
||||
dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const std::shared_ptr<dnnl::primitive_attr>& attr,
|
||||
bool& optimization_is_completed) {
|
||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||
|
||||
// Create new dnnl::post_ops object which will be filled inside the optimization process
|
||||
dnnl::post_ops optimized_p_ops;
|
||||
|
||||
@@ -393,6 +396,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
|
||||
float scale;
|
||||
dnnl::memory::data_type data_type;
|
||||
cur_p_ops.get_params_sum(idx, scale, data_type);
|
||||
// Only conv supports data type specification in append_sum. Other primitives(deconv, fc) do not support it.
|
||||
if (is_type<convolution>()) {
|
||||
new_p_ops.append_sum(scale, data_type);
|
||||
} else {
|
||||
@@ -419,7 +423,8 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
|
||||
|
||||
// Check that post-op type is any optimized
|
||||
auto type_is_any_optimized = [](onednn_post_op_type type) -> bool {
|
||||
return type == onednn_post_op_type::optimized || type == onednn_post_op_type::optimized_sum ||
|
||||
return type == onednn_post_op_type::optimized ||
|
||||
type == onednn_post_op_type::optimized_sum ||
|
||||
type == onednn_post_op_type::optimized_eltwise_act ||
|
||||
type == onednn_post_op_type::optimized_eltwise_linear ||
|
||||
type == onednn_post_op_type::optimized_eltwise_clip ||
|
||||
@@ -462,20 +467,45 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
|
||||
}
|
||||
};
|
||||
|
||||
auto remove_optimized_prefix = [&](std::vector<fused_primitive_desc_onednn>& post_ops) {
|
||||
// Check and update post-op map if we already optimized something
|
||||
auto iter = post_ops.begin();
|
||||
while (iter != post_ops.end()) {
|
||||
if (type_is_optimized_sum(iter->op_type)) {
|
||||
iter->op_type = onednn_post_op_type::sum;
|
||||
++iter;
|
||||
} else if (type_is_optimized_eltwise(iter->op_type)) {
|
||||
iter->op_type = get_eltwise_type(iter->op_type);
|
||||
++iter;
|
||||
} else if (type_is_optimized(iter->op_type)) {
|
||||
iter = post_ops.erase(iter);
|
||||
} else {
|
||||
++iter;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
auto& cur_post_ops = get_fused_primitives_onednn();
|
||||
|
||||
size_t cur_post_op_idx = 1;
|
||||
size_t prev_post_op_idx = 0;
|
||||
bool optimization_done = false;
|
||||
|
||||
// Check and update post-op map if we already optimized something
|
||||
for (size_t post_op_idx = 0; post_op_idx < cur_post_ops.size(); post_op_idx++) {
|
||||
if (type_is_optimized_sum(cur_post_ops[post_op_idx].op_type))
|
||||
cur_post_ops[post_op_idx].op_type = onednn_post_op_type::sum;
|
||||
else if (type_is_optimized_eltwise(cur_post_ops[post_op_idx].op_type))
|
||||
cur_post_ops[post_op_idx].op_type = get_eltwise_type(cur_post_ops[post_op_idx].op_type);
|
||||
else if (type_is_optimized(cur_post_ops[post_op_idx].op_type))
|
||||
cur_post_ops.erase(cur_post_ops.begin() + post_op_idx);
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 3) {
|
||||
GPU_DEBUG_COUT << "================================================" << std::endl;
|
||||
GPU_DEBUG_COUT << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
|
||||
for (size_t i = 0; i < cur_post_ops.size(); i++)
|
||||
GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
|
||||
}
|
||||
|
||||
remove_optimized_prefix(cur_post_ops);
|
||||
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 3) {
|
||||
GPU_DEBUG_COUT << "remove optimized prefix ------------------------" << std::endl;
|
||||
GPU_DEBUG_COUT << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
|
||||
for (size_t i = 0; i < cur_post_ops.size(); i++)
|
||||
GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
|
||||
GPU_DEBUG_COUT << "----------------------------------->>>>>>>>>>>>>" << std::endl;
|
||||
}
|
||||
|
||||
// Get post-ops size for current node
|
||||
@@ -498,6 +528,9 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
|
||||
auto cur_type = cur_post_ops[cur_post_op_idx].op_type;
|
||||
auto prev_type = cur_post_ops[prev_post_op_idx].op_type;
|
||||
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 3)
|
||||
GPU_DEBUG_COUT << "before prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;
|
||||
|
||||
// Ignore optimized operations for "previous" operation in our operation pair
|
||||
while (type_is_any_optimized(prev_type) && prev_post_op_idx < post_ops_size - 1) {
|
||||
prev_post_op_idx++;
|
||||
@@ -513,9 +546,18 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
|
||||
cur_type = cur_post_ops[cur_post_op_idx].op_type;
|
||||
}
|
||||
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 3)
|
||||
GPU_DEBUG_COUT << "after prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;
|
||||
|
||||
auto cur_idx = static_cast<int>(has_out_scales(attr) ? (cur_post_op_idx >= 1 ? cur_post_op_idx - 1 : 0) : cur_post_op_idx);
|
||||
auto prev_idx = static_cast<int>(has_out_scales(attr) ? (prev_post_op_idx >= 1 ? prev_post_op_idx - 1 : 0) : prev_post_op_idx);
|
||||
|
||||
// if 2 indices are same, add the last post-op to dnnl::post_ops
|
||||
if (prev_idx == post_ops_size - 1 && prev_idx == cur_idx && !type_is_any_optimized(prev_type)) {
|
||||
add_post_op(prev_type, p_ops, optimized_p_ops, prev_idx);
|
||||
break;
|
||||
}
|
||||
|
||||
// If this is the last pair and it's optimized - add the last post-op and go out from the cycle
|
||||
if (cur_post_op_idx == post_ops_size - 1 && (type_is_any_optimized(cur_type) || type_is_any_optimized(prev_type))) {
|
||||
if (!type_is_any_optimized(prev_type)) {
|
||||
@@ -542,6 +584,11 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
|
||||
|
||||
bool cur_ops_pair_is_optimized = false;
|
||||
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 3) {
|
||||
GPU_DEBUG_COUT << "prev_idx: " << prev_idx << " " << prev_type
|
||||
<< ", cur_idx: " << cur_idx << " " << cur_type << std::endl;
|
||||
}
|
||||
|
||||
if (can_try_optimize) {
|
||||
if (eltw_and_eltw) {
|
||||
dnnl::algorithm cur_alg, prev_alg;
|
||||
@@ -701,6 +748,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
|
||||
dnnl::post_ops eltw_p_op_prev, sum_p_op;
|
||||
|
||||
eltw_p_op_prev.append_eltwise(eltw_scale * next_alpha * next_scale, alg, alpha, beta);
|
||||
// Only conv supports data type specification in append_sum. Other primitives(deconv, fc) do not support it.
|
||||
if (is_type<convolution>()) {
|
||||
sum_p_op.append_sum(sum_scale * next_alpha, data_type);
|
||||
} else {
|
||||
@@ -769,7 +817,18 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
|
||||
}
|
||||
}
|
||||
|
||||
// if optimization_is_completed is true, try to optimize again.
|
||||
optimization_is_completed = !optimization_is_completed;
|
||||
if (optimization_is_completed) {
|
||||
remove_optimized_prefix(cur_post_ops);
|
||||
}
|
||||
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 3) {
|
||||
GPU_DEBUG_COUT << ">>>>>>>>>>>>>-----------------------------------" << std::endl;
|
||||
for (size_t i = 0; i < cur_post_ops.size(); i++)
|
||||
GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
|
||||
GPU_DEBUG_COUT << "------------------------------------------------" << std::endl;
|
||||
}
|
||||
|
||||
add_onednn_fused_primitives(cur_post_ops);
|
||||
|
||||
@@ -805,6 +864,7 @@ void program_node::init_onednn_primitive_attributes() {
|
||||
memory_offset++;
|
||||
};
|
||||
|
||||
int32_t num_sum_post_ops = 0;
|
||||
for (size_t idx = 0; idx < cldnn_post_ops.size(); idx++) {
|
||||
auto node = cldnn_post_ops[idx].node;
|
||||
|
||||
@@ -834,13 +894,11 @@ void program_node::init_onednn_primitive_attributes() {
|
||||
auto in = get_dependency(dep_idx).get_output_layout();
|
||||
|
||||
if (e_node.get_primitive()->mode == eltwise_mode::sum) {
|
||||
if (program_helpers::needs_onednn_sum_post_op(e_node, in)) {
|
||||
if (is_type<convolution>()) {
|
||||
post_ops.append_sum(1.0f, onednn::convert_data_type(in.data_type));
|
||||
} else {
|
||||
post_ops.append_sum(1.0f);
|
||||
}
|
||||
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*this, cldnn_post_ops[idx]);
|
||||
if (fusing_type == add_fusing_type::sum && num_sum_post_ops == 0) {
|
||||
post_ops.append_sum(1.0f);
|
||||
update_onednn_post_op_list(onednn_post_op_type::sum, dep_idx);
|
||||
num_sum_post_ops++;
|
||||
} else {
|
||||
dnnl::memory::desc in_desc = onednn::layout_to_memory_desc(in);
|
||||
post_ops.append_binary(dnnl::algorithm::binary_add, in_desc);
|
||||
|
||||
@@ -107,6 +107,7 @@ struct gpu_usm : public lockable_gpu_mem, public memory {
|
||||
void unlock(const stream& stream) override;
|
||||
const cl::UsmMemory& get_buffer() const { return _buffer; }
|
||||
cl::UsmMemory& get_buffer() { return _buffer; }
|
||||
void* buffer_ptr() const override { return _buffer.get(); }
|
||||
|
||||
event::ptr fill(stream& stream, unsigned char pattern) override;
|
||||
event::ptr fill(stream& stream) override;
|
||||
|
||||
@@ -3410,68 +3410,130 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_input_range, ::test
|
||||
convolution_test_params{ CASE_CONV_S8S8_15, 2, 3 },
|
||||
}));
|
||||
|
||||
// Parameter bundle for the oneDNN convolution + eltwise-sum fusing tests; it is the
// template argument of BaseFusingTest in EltwiseSumFusingTestOneDNN.
// Field order is significant: instances are brace-initialized positionally from a
// CASE_CONV_ELTW_SUM_* macro followed by the two trailing primitive counts.
struct convolution_eltw_sum_test_params {
|
||||
tensor in_shape;  // shape of the "input" layout (see get_input_layout)
|
||||
tensor out_shape;  // shape used for the eltwise operand data; feature[0] sizes per-channel data
|
||||
tensor kernel;  // presumably the convolution kernel size; not read in the visible code - TODO confirm
|
||||
ov::Strides stride;  // forwarded to convolution(...)
|
||||
ov::CoordinateDiff pad;  // forwarded to convolution(...); pad[0]/pad[1] also become input layout padding
|
||||
ov::Strides dilation;  // forwarded to convolution(...)
|
||||
uint32_t groups;  // forwarded to convolution(...)
|
||||
data_types data_type;  // data type of the input layout
|
||||
format input_format;  // format of the input layout
|
||||
data_types weights_type;  // presumably consumed by get_weights_layout(p), defined outside this view
|
||||
format weights_format;  // presumably consumed by get_weights_layout(p), defined outside this view
|
||||
data_types eltw_type;  // data type of the eltwise operand data
|
||||
format eltw_format;  // format of the full-tensor eltwise operand data
|
||||
data_types out_type;  // output type given to the quantize and eltwise primitives in the tests
|
||||
format out_format;  // not read in the visible code - TODO confirm where it is used
|
||||
data_types default_type;  // type of the final reorder and of the per-channel layout
|
||||
format default_format;  // format of the final reorder and of the per-channel layout
|
||||
size_t expected_fused_primitives;  // first trailing count in the test instantiations; presumably checked by compare()
|
||||
size_t expected_not_fused_primitives;  // second trailing count in the test instantiations; presumably checked by compare()
|
||||
};
|
||||
|
||||
// input:b_fs_yx_fsv32:u8 X weight:bfyx:i8 + eltwise_sum:b_fs_yx_fsv32:u8
|
||||
// After optimization: eltwise_any + binary_add
|
||||
// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_tanh+binary_add:u8:14:aBcd32b+eltwise_linear:1
|
||||
class post_ops_optimizations_onednn_binary_add_full_tensor : public WeightsPrimitiveFusingTestOneDNN {};
|
||||
TEST_P(post_ops_optimizations_onednn_binary_add_full_tensor, basic) {
|
||||
// Fixture for the oneDNN convolution + eltwise-sum fusing tests.
// execute() builds one network from topology_non_fused and one from topology_fused,
// feeds both the same "input" memory and hands them to compare() (defined outside this view).
class EltwiseSumFusingTestOneDNN : public BaseFusingTest<convolution_eltw_sum_test_params> {
|
||||
public:
|
||||
// Runs both networks on identical input, compares them, then inspects the fused convolution's kernel id.
void execute(convolution_eltw_sum_test_params& p) {
|
||||
// u8 inputs are created with the extra arguments (0, 10) - presumably a value range, TODO confirm get_mem's signature.
auto input_prim = p.data_type == data_types::u8 ? get_mem(get_input_layout(p), 0, 10) : get_mem(get_input_layout(p));
|
||||
|
||||
network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
|
||||
network network_fused(this->engine, this->topology_fused, bo_fused);
|
||||
network_fused.set_input_data("input", input_prim);
|
||||
network_not_fused.set_input_data("input", input_prim);
|
||||
|
||||
compare(network_not_fused, network_fused, p);
|
||||
|
||||
auto pi_fused = network_fused.get_primitives_info();
|
||||
// Find the primitive info whose original_id is "conv_prim"; the lambda parameter p shadows the outer p.
auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), [](primitive_info& p) -> bool {
|
||||
if (p.original_id == "conv_prim")
|
||||
return true;
|
||||
return false;
|
||||
});
|
||||
|
||||
// Only when the device reports supports_immad: print the kernel id and expect it to contain "jit:ir".
if (info_fused != pi_fused.end() && engine.get_device_info().supports_immad) {
|
||||
std::cout << "kernel: " << info_fused->kernel_id << std::endl;
|
||||
EXPECT_TRUE(info_fused->kernel_id.find("jit:ir") != std::string::npos);
|
||||
}
|
||||
}
|
||||
|
||||
// Input layout built from data_type / input_format / in_shape, padded using the convolution pad values.
layout get_input_layout(convolution_eltw_sum_test_params& p) {
|
||||
auto pad = p.pad;
|
||||
// pad[0] and pad[1] go into the last two entries; the first two entries are zero.
std::vector<int> pad_ = { 0, 0, static_cast<int>(pad[0]), static_cast<int>(pad[1]) };
|
||||
return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } };
|
||||
}
|
||||
|
||||
// Layout of shape { 1, out_shape.feature[0], 1, 1 } in the default type and format.
layout get_per_channel_layout(convolution_eltw_sum_test_params& p) {
|
||||
return layout{ p.default_type, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } };
|
||||
}
|
||||
};
|
||||
|
||||
class onednn_binary_add_full_tensor : public EltwiseSumFusingTestOneDNN {};
|
||||
// Parameterized test: builds the topologies through create_topologies(...), sets tolerance to 1.f
// and runs execute(p). The chain in the newer rows is conv_prim -> tanh activation -> quantize1 ->
// eltwise sum with the full-tensor "eltwise_data" -> reorder_bfyx.
// NOTE(review): this span looks like removed and added diff rows interleaved - "eltwise_data",
// "conv_prim", "sum" and "reorder_bfyx" each appear twice, and the first reorder row has no
// trailing comma before the next argument. Confirm against the real source file before relying on it.
TEST_P(onednn_binary_add_full_tensor, basic) {
|
||||
auto p = GetParam();
|
||||
|
||||
create_topologies(
|
||||
input_layout("input", get_input_layout(p)),
|
||||
data("weights", get_mem(get_weights_layout(p))),
|
||||
data("bias", get_mem(get_bias_layout(p))),
|
||||
data("in_lo", get_mem(get_single_element_layout(p), 0)),
|
||||
data("in_hi", get_mem(get_single_element_layout(p), 255)),
|
||||
data("out_lo", get_mem(get_single_element_layout(p), 0)),
|
||||
data("out_hi", get_mem(get_single_element_layout(p), 255)),
|
||||
data("eltwise_data", get_mem(get_output_layout(p), 0, 255)),
|
||||
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
|
||||
data("in_lo1", get_mem(get_single_element_layout(p), 0)),
|
||||
data("in_hi1", get_mem(get_single_element_layout(p), 100)),
|
||||
data("out_lo1", get_mem(get_single_element_layout(p), 0)),
|
||||
data("out_hi1", get_mem(get_single_element_layout(p), 100)),
|
||||
data("eltwise_data", get_mem(layout{ p.eltw_type, p.eltw_format, p.out_shape }, 0, 100)),
|
||||
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation, false),
|
||||
activation("activation", "conv_prim", activation_func::hyperbolic_tan),
|
||||
eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum),
|
||||
quantize("quantize", "sum", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8),
|
||||
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
|
||||
quantize("quantize1", "activation", "in_lo1", "in_hi1", "out_lo1", "out_hi1", 256, p.out_type),
|
||||
eltwise("sum", { "quantize1", "eltwise_data" }, eltwise_mode::sum, p.out_type),
|
||||
reorder("reorder_bfyx", "sum", p.default_format, p.default_type)
|
||||
);
|
||||
|
||||
tolerance = 1.f;
|
||||
execute(p);
|
||||
}
|
||||
|
||||
// in_shape; out_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format;
|
||||
#define CASE_CONV_U8S8_FT_BINARY_ADD_1 { 1, 32, 4, 4 }, { 1, 16, 4, 4 }, tensor{ 1, 1, 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, 1, data_types::u8, format::b_fs_yx_fsv32, data_types::i8, format::bfyx, data_types::f32, format::bfyx
|
||||
// in_shape; out_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; eltw_type; eltw_format; out_type; out_format; default_type; default_format;
|
||||
#define CASE_CONV_ELTW_SUM_BINARY_ADD_1 { 1, 32, 4, 4 }, { 1, 16, 4, 4 }, { 1, 1, 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, 1, data_types::u8, format::b_fs_yx_fsv32, data_types::i8, format::bfyx, data_types::u8, format::b_fs_yx_fsv32, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
|
||||
#define CASE_CONV_ELTW_SUM_SUM_1 { 1, 32, 4, 4 }, { 1, 16, 4, 4 }, { 1, 1, 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, 1, data_types::u8, format::b_fs_yx_fsv32, data_types::i8, format::bfyx, data_types::u8, format::b_fs_yx_fsv32, data_types::u8, format::b_fs_yx_fsv32, data_types::f32, format::bfyx
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_add_full_tensor, ::testing::ValuesIn(std::vector<convolution_test_params>{
|
||||
// cases with batch = 1
|
||||
convolution_test_params{ CASE_CONV_U8S8_FT_BINARY_ADD_1, 2, 5 },
|
||||
INSTANTIATE_TEST_SUITE_P(eltwise_sum_fusings_gpu, onednn_binary_add_full_tensor, ::testing::ValuesIn(std::vector<convolution_eltw_sum_test_params>{
|
||||
convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_BINARY_ADD_1, 2, 5 },
|
||||
convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_SUM_1, 2, 5 },
|
||||
}));
|
||||
|
||||
|
||||
// input:b_fs_yx_fsv16:f16 X weight:bfyx:f16 + eltwise_sum:b_fs_yx_fsv16:f16
|
||||
// After optimization: eltwise_any + sum
|
||||
// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_tanh+sum:1:0:f16
|
||||
class post_ops_optimizations_onednn_sum_full_tensor : public WeightsPrimitiveFusingTestOneDNN {};
|
||||
TEST_P(post_ops_optimizations_onednn_sum_full_tensor, basic) {
|
||||
class onednn_multiple_binary_add_full_tensor : public EltwiseSumFusingTestOneDNN {};
|
||||
// Parameterized test with three chained eltwise sums after the convolution: the newer rows build
// conv_prim -> tanh activation -> quantize1 -> sum (full tensor) -> sum1 (full tensor) ->
// sum2 (operand of shape { 1, out feature count, 1, 1 }) -> reorder_bfyx, then set tolerance to 1.f
// and run execute(p).
// NOTE(review): this span looks like removed and added diff rows interleaved - "conv_prim",
// "eltwise_data", "sum" and "reorder_bfyx" each appear twice, and the first reorder row has no
// trailing comma before the next argument. Confirm against the real source file before relying on it.
TEST_P(onednn_multiple_binary_add_full_tensor, basic) {
|
||||
auto p = GetParam();
|
||||
|
||||
create_topologies(
|
||||
input_layout("input", get_input_layout(p)),
|
||||
data("weights", get_mem(get_weights_layout(p))),
|
||||
data("bias", get_mem(get_bias_layout(p))),
|
||||
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
|
||||
data("in_lo1", get_mem(get_single_element_layout(p), 0)),
|
||||
data("in_hi1", get_mem(get_single_element_layout(p), 100)),
|
||||
data("out_lo1", get_mem(get_single_element_layout(p), 0)),
|
||||
data("out_hi1", get_mem(get_single_element_layout(p), 100)),
|
||||
data("eltwise_data", get_mem(layout{ p.eltw_type, p.eltw_format, p.out_shape }, 0, 100)),
|
||||
data("eltwise_data1", get_mem(layout{ p.eltw_type, p.eltw_format, p.out_shape }, 0, 100)),
|
||||
data("eltwise_data2", get_mem(layout{ p.eltw_type, format::bfyx, tensor{ 1, p.out_shape.feature[0], 1, 1 } }, 0, 100)),
|
||||
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation, false),
|
||||
activation("activation", "conv_prim", activation_func::hyperbolic_tan),
|
||||
data("eltwise_data", get_mem(get_output_layout(p), 0, 255)),
|
||||
eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum),
|
||||
reorder("reorder_bfyx", "sum", p.default_format, data_types::f32)
|
||||
quantize("quantize1", "activation", "in_lo1", "in_hi1", "out_lo1", "out_hi1", 256, p.out_type),
|
||||
eltwise("sum", { "quantize1", "eltwise_data" }, eltwise_mode::sum, p.out_type), // eltwise sum with full tensor
|
||||
eltwise("sum1", { "sum", "eltwise_data1" }, eltwise_mode::sum, p.out_type), // eltwise sum with full tensor
|
||||
eltwise("sum2", { "sum1", "eltwise_data2" }, eltwise_mode::sum, p.out_type), // eltwise sum with broadcasting
|
||||
reorder("reorder_bfyx", "sum2", p.default_format, p.default_type)
|
||||
);
|
||||
|
||||
tolerance = 1.f;
|
||||
execute(p);
|
||||
}
|
||||
|
||||
#define CASE_CONV_F16F16_FT_ELTW_SUM_1 { 1, 32, 4, 4 }, { 1, 16, 4, 4 }, tensor{ 1, 1, 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, 1, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::bfyx, data_types::f32, format::bfyx
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_sum_full_tensor, ::testing::ValuesIn(std::vector<convolution_test_params>{
|
||||
// cases with batch = 1
|
||||
convolution_test_params{ CASE_CONV_F16F16_FT_ELTW_SUM_1, 2, 4 },
|
||||
INSTANTIATE_TEST_SUITE_P(multiple_eltwise_sum_fusings_gpu, onednn_multiple_binary_add_full_tensor, ::testing::ValuesIn(std::vector<convolution_eltw_sum_test_params>{
|
||||
convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_BINARY_ADD_1, 2, 7 },
|
||||
convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_SUM_1, 2, 7 },
|
||||
}));
|
||||
|
||||
|
||||
#endif // ENABLE_ONEDNN_FOR_GPU
|
||||
|
||||
@@ -227,4 +227,10 @@ public:
|
||||
topology_fused.add(args...);
|
||||
topology_non_fused.add(args...);
|
||||
}
|
||||
|
||||
// Adds every given primitive to both topology_fused and topology_non_fused, so the two
// topologies receive the same arguments in the same order.
template <class... Args>
|
||||
void add_topologies(Args const&... args) {
|
||||
topology_fused.add(args...);
|
||||
topology_non_fused.add(args...);
|
||||
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user