[GPU] Fixed fused_primitive_desc to have -1 value for dep_start_idx (#17099)

* Fixed fused_primitive_desc to have -1 value for dep_start_idx

* Fixed dgpu i8 errors
This commit is contained in:
Taylor Yeonbok Lee 2023-04-24 15:21:58 -07:00 committed by GitHub
parent 3830125e3b
commit ce23ce00f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 170 additions and 60 deletions

View File

@ -44,7 +44,7 @@ struct fused_primitive_desc {
bool operator==(const fused_primitive_desc& rhs) const {
if (total_num_deps != rhs.total_num_deps)
return false;
if (dep_start_idx != rhs.dep_start_idx)
if (outer_dep_start_idx != rhs.outer_dep_start_idx)
return false;
return *desc == *rhs.desc;
@ -52,6 +52,8 @@ struct fused_primitive_desc {
// Logical inverse of operator== (deps count, outer-dep index, and wrapped primitive all equal).
bool operator!=(const fused_primitive_desc& rhs) const { return !this->operator==(rhs); }
// A non-negative start index means this fused primitive still reads an input that stayed outside the fusion (-1 == none).
bool has_outer_dep() const { return 0 <= outer_dep_start_idx; }
std::shared_ptr<const primitive> desc;
layout input_layout = layout(data_types::f32, format::bfyx, tensor());
@ -61,7 +63,11 @@ struct fused_primitive_desc {
std::vector<std::pair<primitive_id, size_t>> deps;
std::map<primitive_id, size_t> fused_deps;
size_t dep_start_idx;
// TODO:
// Currently, it assumes very simple case where dep 0 is the fused node and no input sharing b/w fused node and peer node
// To cover such cases where some of the peer node uses input of fused node, we need to maintain actual indexes of the dependencies
// not only the "starting index".
int32_t outer_dep_start_idx = -1; // if -1, no external dep after fusing
size_t total_num_deps = 0;
};

View File

@ -73,10 +73,9 @@ void add_required_reorders::run(program& p) {
if (!fused_op.is_type<eltwise>() && !(fused_op.is_type<activation>() && fused_op.total_num_deps == 2))
continue;
auto dep_id = fused_op.dep_start_idx;
if (dep_id >= usr->get_dependencies().size())
if (!fused_op.has_outer_dep())
continue;
auto dep_id = fused_op.outer_dep_start_idx;
auto& dep = usr->get_dependency(dep_id);
if (!dep.is_type<data>())
continue;

View File

@ -43,8 +43,9 @@ void basic_memory_dependencies::run(program& p) {
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
continue;
eltw_dep = fused_op.dep_start_idx;
if (!fused_op.has_outer_dep())
continue;
eltw_dep = fused_op.outer_dep_start_idx;
auto& eltw_node = node->get_dependency(eltw_dep);
eltw_node.can_share_buffer(false);
node->can_share_buffer(false);

View File

@ -545,7 +545,7 @@ void remove_redundant_reorders::run(program& p) {
local_desc.input_layout = input.get_dependency(0).get_output_layout(); // original convolution's output layout
node->set_input_layout(local_desc.input_layout);
local_desc.f_param = node->get_fuse_params();
local_desc.dep_start_idx = input.get_fused_primitives().size();
local_desc.outer_dep_start_idx = -1;
local_desc.output_layout = output_layout;
input.add_fused_primitive(local_desc);

View File

@ -862,8 +862,8 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
onednn_add_fusing_helpers::for_eltwise(conv_node, eltwise_mode::sum,
[&](const program_node& p_node, const fused_primitive_desc& desc) {
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(p_node, desc);
if (fusing_type == add_fusing_type::binary_per_tensor) {
auto& dep_node = p_node.get_dependency(desc.dep_start_idx);
if (fusing_type == add_fusing_type::binary_per_tensor && desc.has_outer_dep()) {
auto& dep_node = p_node.get_dependency(desc.outer_dep_start_idx);
auto d_layout = dep_node.get_output_layout();
auto d_format = d_layout.format;
auto expected_format = format::any;
@ -885,9 +885,9 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
new_layout.format = expected_format;
auto new_input = rf.get_reorder(dep_node.id(), d_layout, new_layout);
if (new_input.first) {
p.add_intermediate(new_input.first, conv_node, desc.dep_start_idx, !new_input.second);
p.add_intermediate(new_input.first, conv_node, desc.outer_dep_start_idx, !new_input.second);
}
conv_node.get_dependency(desc.dep_start_idx).set_output_layout(new_layout, false);
conv_node.get_dependency(desc.outer_dep_start_idx).set_output_layout(new_layout, false);
}
}
});
@ -965,7 +965,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
if (activation_desc->activation_function == cldnn::activation_func::relu_negative_slope &&
!activation_desc->additional_params_input.empty()) {
const auto expected_dt = data_types::f32;
const auto dep_idx = fused_desc.dep_start_idx;
const auto dep_idx = fused_desc.outer_dep_start_idx;
const auto orig_layout = node->get_dependency(dep_idx).get_output_layout();
if (orig_layout.data_type == expected_dt)
continue;
@ -992,7 +992,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
for (const auto& fused_prim : node->get_fused_primitives()) {
if (fused_prim.is_type<eltwise>() &&
one_of(fused_prim.typed_desc<eltwise>()->mode, {eltwise_mode::sum, eltwise_mode::sub, eltwise_mode::prod})) {
auto& data = node->get_dependency(fused_prim.dep_start_idx);
auto& data = node->get_dependency(fused_prim.outer_dep_start_idx);
auto gemm_layout = node->get_output_layout();
auto data_layout = data.get_output_layout();
@ -1016,7 +1016,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
auto broadcast_prim = std::make_shared<cldnn::broadcast>(prim_id, cldnn::input_info(data.id()), gemm_layout.get_shape(), ov::AxisSet{});
auto& broadcast_node = p.get_or_create(broadcast_prim);
p.add_intermediate(broadcast_node, *node, fused_prim.dep_start_idx, true);
p.add_intermediate(broadcast_node, *node, fused_prim.outer_dep_start_idx, true);
broadcast_node.recalc_output_layouts(false);
}
}
@ -1025,7 +1025,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
if (fused_prim.is_type<eltwise>() &&
one_of(fused_prim.typed_desc<eltwise>()->mode, {eltwise_mode::sum, eltwise_mode::sub, eltwise_mode::prod})) {
auto fc_layout = node->get_output_layout();
auto& data = node->get_dependency(fused_prim.dep_start_idx);
auto& data = node->get_dependency(fused_prim.outer_dep_start_idx);
auto data_layout = data.get_output_layout();
if (fc_layout.is_dynamic() || data_layout.is_dynamic())
@ -1060,7 +1060,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
auto broadcast_prim = std::make_shared<cldnn::broadcast>(prim_id, cldnn::input_info(data.id()), fc_layout.get_shape(), ov::AxisSet{});
auto& broadcast_node = p.get_or_create(broadcast_prim);
p.add_intermediate(broadcast_node, *node, fused_prim.dep_start_idx, true);
p.add_intermediate(broadcast_node, *node, fused_prim.outer_dep_start_idx, true);
broadcast_node.recalc_output_layouts(false);
}
}

View File

@ -147,14 +147,15 @@ inline params_t get_default_params(const kernel_impl_params& param_info, bool is
OPENVINO_ASSERT(desc.op_params != nullptr, "[GPU] Invalid fused operation (", param_info.desc->id , ") of type ", param_info.desc->type_string());
desc.dep_idx_start = fused_prim.dep_start_idx;
desc.dep_idx_start = fused_prim.outer_dep_start_idx;
desc.dep_size = fused_prim.deps.size();
desc.op_id = op_id++;
desc.output_tensor = convert_data_tensor(fused_prim.output_layout);
prim_id_type_map[fused_prim.desc->id] = std::make_pair(desc.op_id, desc.output_tensor.GetDType());
for (size_t i = desc.dep_idx_start; i < desc.dep_idx_start + desc.dep_size; i++) {
desc.tensors.push_back(convert_data_tensor(param_info.get_input_layout(i)));
if (fused_prim.has_outer_dep()) {
for (size_t i = desc.dep_idx_start; i < desc.dep_idx_start + desc.dep_size; i++) {
desc.tensors.push_back(convert_data_tensor(param_info.get_input_layout(i)));
}
}
if (fused_prim.total_num_deps > 0) {
@ -334,7 +335,7 @@ inline kernel_impl_params canonicalize_fused_shapes(const kernel_impl_params& im
if (fd.is_type<eltwise>() && fd.total_num_deps == 2) {
auto out_pshape = updated_impl_params.output_layouts[0].get_partial_shape();
auto& dep_layout = updated_impl_params.input_layouts[fd.dep_start_idx];
auto& dep_layout = updated_impl_params.input_layouts[fd.outer_dep_start_idx];
auto dep_shape = dep_layout.get_partial_shape();
if (!broadcastable(dep_shape, out_pshape, use_new_shape_infer)) {

View File

@ -201,6 +201,7 @@ public:
bool can_share_buffer() const { return _can_share_buffer; }
bool is_constant() const { return _is_constant; }
bool is_output_event() const { return _is_output_event; }
// True once a fallback (unfused) subgraph has been materialized for this instance.
bool has_unfused_subgraph() const { return _unfused_subgraph != nullptr; }
void allocate_internal_buffers();
static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node,

View File

@ -104,6 +104,24 @@ public:
bool is_fused_dep(size_t dep_idx) const;
// Returns true if any fused primitive of this node keeps a dependency on an
// input outside the fused node (i.e. its outer_dep_start_idx >= 0).
bool has_fused_dep() const {
    // Iterate by const reference: fused_primitive_desc owns vectors, maps and
    // a shared_ptr, so copying each descriptor per iteration is wasteful.
    for (const auto& fused : get_fused_primitives()) {
        if (fused.has_outer_dep())
            return true;
    }
    return false;
}
// Returns the outer-dep start index of the first fused primitive that has an
// external dependency, or -1 if no fused primitive does.
int32_t get_first_fused_dep_idx() const {
    // Single pass: the original called has_fused_dep() first, scanning the
    // fused-primitive list twice; it also copied each descriptor in the
    // range-for. Take const references and return on first match instead.
    for (const auto& fused : get_fused_primitives()) {
        if (fused.has_outer_dep())
            return fused.outer_dep_start_idx;
    }
    return -1;
}
std::map<size_t, memory::ptr> get_const_memory_deps() const;
virtual std::unique_ptr<kernel_impl_params> get_kernel_impl_params() const {

View File

@ -405,7 +405,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
for (auto& p : next.get_fused_primitives()) {
// find eltwise sum primitive which has dependency nodes, and gather dependency indices of it.
if (p.is_type<eltwise>() && p.typed_desc<eltwise>()->mode == eltwise_mode::sum) {
for (size_t i = p.dep_start_idx; i < p.dep_start_idx + p.total_num_deps; i++) {
for (size_t i = p.outer_dep_start_idx; i < p.outer_dep_start_idx + p.total_num_deps; i++) {
dep_idx_set.insert(i);
}
}

View File

@ -604,7 +604,9 @@ void network::save(cldnn::BinaryOutputBuffer& ob) {
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
continue;
eltw_dep = fused_op.dep_start_idx;
if (!fused_op.has_outer_dep())
continue;
eltw_dep = fused_op.outer_dep_start_idx;
auto& eltw_in = node->get_dependency(eltw_dep);
if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
reuse_map[node->id()] = eltw_in.id();
@ -1007,7 +1009,9 @@ void network::allocate_primitives() {
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
continue;
eltw_dep = fused_op.dep_start_idx;
if (!fused_op.has_outer_dep())
continue;
eltw_dep = fused_op.outer_dep_start_idx;
auto& eltw_in = node->get_dependency(eltw_dep);
if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
auto& eltw_inst = _primitives.at(eltw_in.id());

View File

@ -600,7 +600,7 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
, _inputs_memory_count(node.get_primitive()->input_size())
, _outputs_memory_count(node.get_primitive()->output_size())
, _fused_mem_count(node.get_fused_inputs_count())
, _fused_mem_offset(_fused_mem_count > 0 ? node.get_fused_primitives()[0].dep_start_idx : 0)
, _fused_mem_offset((_fused_mem_count > 0 && node.has_fused_dep()) ? node.get_first_fused_dep_idx() : 0)
, _can_be_optimized(node.can_be_optimized())
, _can_share_buffer(node.can_share_buffer())
, _is_constant(node.is_constant()) {
@ -971,7 +971,7 @@ cldnn::network::ptr primitive_inst::get_unfused_subgraph() {
if (!_unfused_subgraph) {
topology t;
std::vector<primitive_id> dep_ids;
std::vector<primitive_id> outer_dep_ids;
// Add input primitives: constants are moved as is
// Any other primitive types are replaced with input_layout
for (auto& dep : _node->get_dependencies()) {
@ -985,12 +985,12 @@ cldnn::network::ptr primitive_inst::get_unfused_subgraph() {
input_layout in_prim(dep.first->id(), dep.first->get_output_layout());
t.add(in_prim);
}
dep_ids.push_back(dep.first->id());
outer_dep_ids.push_back(dep.first->id());
}
// Create the primitive itself
t.add_primitive(std::const_pointer_cast<primitive>(_node->get_primitive()));
dep_ids.push_back(_node->id());
outer_dep_ids.push_back(_node->id());
// Add primitives for fused-ops
for (auto& fd : _impl_params->fused_desc) {
@ -1008,25 +1008,26 @@ cldnn::network::ptr primitive_inst::get_unfused_subgraph() {
// And when we construct unfused subgraph for prim2, we take original eltwise2 primitive which expects eltwise1 primitive as input
// which doesn't exist anymore in the graph
// Thus we update dependency name used dependencies idx stored in fused descriptor.
if (std::find_if(dep_ids.begin(), dep_ids.end(),
[&](const primitive_id& pid) {
return pid == in.pid;
}) == dep_ids.end()) {
size_t dep_id = fd.dep_start_idx;
in = _node->get_dependency(dep_id).id();
if (fd.has_outer_dep()) {
if (std::find_if(outer_dep_ids.begin(), outer_dep_ids.end(), [&](const primitive_id& pid) {
return pid == in.pid;
}) == outer_dep_ids.end()) {
size_t dep_id = fd.outer_dep_start_idx;
in = _node->get_dependency(dep_id).id();
}
}
}
t.add_primitive(prim);
dep_ids.push_back(prim->id);
outer_dep_ids.push_back(prim->id);
}
// Samely, need to update dependency of the current fused nodes' input primitive ids with those in the current program
auto prim_of_fused_node = std::const_pointer_cast<primitive>(_impl_params->desc);
for (size_t i = 0; i < prim_of_fused_node->input.size(); ++i) {
auto& in = prim_of_fused_node->input[i];
if (std::find_if(dep_ids.begin(), dep_ids.end(),
if (std::find_if(outer_dep_ids.begin(), outer_dep_ids.end(),
[&](const primitive_id& pid) {
return pid == in.pid;
}) == dep_ids.end()) {
}) == outer_dep_ids.end()) {
in = _node->get_dependency(i).id();
}
}
@ -1048,11 +1049,12 @@ bool primitive_inst::is_valid_fusion() const {
auto fuse_descriptors = _impl_params->fused_desc;
if (fuse_descriptors.empty())
return true;
std::vector<fused_primitive_desc> fused_eltwise_prims;
for (auto& fd : fuse_descriptors) {
if (fd.is_type<eltwise>()) {
if (fd.is_type<eltwise>() || fd.is_type<activation>()) {
fused_eltwise_prims.push_back(fd);
} else {
OPENVINO_ASSERT("[GPU] Unsupported fused operation in dynamic shape : ", fd.desc->id);
}
}
@ -1061,14 +1063,16 @@ bool primitive_inst::is_valid_fusion() const {
auto out_pshape = _impl_params->get_output_layout().get_partial_shape();
for (auto& fd : fused_eltwise_prims) {
auto dep_idx = fd.dep_start_idx;
OPENVINO_ASSERT(fd.total_num_deps == 2, "[GPU] Unexpected count of dependencies in dynamic fusion for eltwise");
OPENVINO_ASSERT(_deps.size() > dep_idx, "[GPU] Invalid fused dependency idx");
auto dep = _deps[dep_idx];
auto outer_dep_idx = fd.outer_dep_start_idx;
if (outer_dep_idx < 0) // no outer dep
continue;
OPENVINO_ASSERT(fd.total_num_deps == 2, "[GPU] Unexpected count of dependencies in dynamic fusion for eltwise or activation");
OPENVINO_ASSERT(outer_dep_idx < 0 || static_cast<int32_t>(_deps.size()) > outer_dep_idx, "[GPU] Invalid fused dependency idx");
auto outer_dep = _deps[outer_dep_idx];
auto dep_pshape = dep.first->_impl_params->get_output_layout().get_partial_shape();
auto outer_dep_pshape = outer_dep.first->_impl_params->get_output_layout().get_partial_shape();
auto merged_shape = out_pshape;
auto can_broadcast = ov::PartialShape::broadcast_merge_into(merged_shape, dep_pshape, fd.typed_desc<eltwise>()->broadcast_spec);
auto can_broadcast = ov::PartialShape::broadcast_merge_into(merged_shape, outer_dep_pshape, fd.typed_desc<eltwise>()->broadcast_spec);
#ifdef ENABLE_ONEDNN_FOR_GPU
// WA for OneDNN binary add fusions: we need to broadcast batch dimension to avoid situation with
@ -1079,7 +1083,7 @@ bool primitive_inst::is_valid_fusion() const {
// correctly and we need to do it manually
if (_node->is_type<gemm>() && _node->get_preferred_impl_type() == impl_types::onednn) {
auto gemm_layout = _impl_params->get_output_layout();
auto data_layout = dep.first->_impl_params->get_output_layout();
auto data_layout = outer_dep.first->_impl_params->get_output_layout();
auto gemm_dims = onednn::convert_gemm_tensor(gemm_layout.get_tensor(),
cldnn::format::dimension(gemm_layout.format),
false);

View File

@ -1077,11 +1077,11 @@ void program::fuse_nodes(program_node &fused_node,
auto peer_layout = peer_node.get_output_layout();
fused_primitive_desc local_desc(peer_node.get_primitive());
local_desc.f_param = get_node_ptr(peer_node.id())->get_fuse_params();
local_desc.dep_start_idx = fused_node.get_dependencies().size();
local_desc.total_num_deps = peer_node.get_dependencies().size();
local_desc.input_layout = peer_node.get_dependency(0).get_output_layout();
local_desc.output_layout = peer_layout;
int32_t orig_fused_node_num_deps = static_cast<int32_t>(fused_node.get_dependencies().size());
auto fusedPadding = fused_node.get_output_layout().data_padding;
cldnn::padding needed_padding = padding::max(peer_layout.data_padding,
fusedPadding);
@ -1092,7 +1092,6 @@ void program::fuse_nodes(program_node &fused_node,
local_desc.fused_deps.emplace(id.first, id.second);
}
}
// Add new dependencies to the fused_node
size_t deps_idx = 0;
for (size_t i = 0; i < peer_node.get_dependencies().size(); i++) {
@ -1129,6 +1128,10 @@ void program::fuse_nodes(program_node &fused_node,
local_desc.deps.emplace_back(dep.id(), deps_idx++);
dep.users.push_back(&fused_node);
}
if (local_desc.deps.size()) {
local_desc.outer_dep_start_idx = orig_fused_node_num_deps;
}
local_desc.total_num_deps = std::min(local_desc.total_num_deps, deps_idx);
fused_node.add_fused_primitive(local_desc);

View File

@ -100,8 +100,10 @@ add_fusing_type onednn_add_fusing_helpers::get_add_fusing_type(
if (desc.typed_desc<eltwise>()->mode != eltwise_mode::sum) {
return add_fusing_type::not_supported;
}
auto& dep_node = p_node.get_dependency(desc.dep_start_idx);
if (!desc.has_outer_dep()) {
return add_fusing_type::not_supported;
}
auto& dep_node = p_node.get_dependency(desc.outer_dep_start_idx);
auto p_layout = p_node.get_output_layout();
auto d_layout = dep_node.get_output_layout();

View File

@ -123,7 +123,7 @@ std::unique_ptr<json_composite> program_node::desc_to_json() const {
dep_ids.push_back(dep.first);
}
fused_node_info.add("dependencies", dep_ids);
fused_node_info.add("dep start_idx", fused_desc.dep_start_idx);
fused_node_info.add("dep start_idx", fused_desc.outer_dep_start_idx);
json_composite info;
info.add("data type", dt_to_str(fused_desc.output_layout.data_type));
info.add("format", output_layouts[0].format.to_string());
@ -380,7 +380,7 @@ bool program_node::has_padded_dependency() const {
bool program_node::is_fused_dep(size_t dep_idx) const {
for (auto fused : get_fused_primitives()) {
if (dep_idx >= fused.dep_start_idx) {
if (fused.has_outer_dep() && static_cast<int32_t>(dep_idx) >= fused.outer_dep_start_idx) {
return true;
}
}
@ -944,7 +944,7 @@ void program_node::init_onednn_primitive_attributes() {
auto fused_desc = desc.typed_desc<activation>();
if (fused_desc->activation_function == cldnn::activation_func::relu_negative_slope
&& !fused_desc->additional_params_input.empty()) {
auto dep_idx = cldnn_post_ops[idx].dep_start_idx;
auto dep_idx = cldnn_post_ops[idx].outer_dep_start_idx;
int oc_dim = static_cast<int>(desc.output_layout.get_tensor().feature.size());
post_ops.append_prelu(1 << oc_dim);
update_onednn_post_op_list(onednn_post_op_type::binary_relu, dep_idx);
@ -975,9 +975,8 @@ void program_node::init_onednn_primitive_attributes() {
update_onednn_post_op_list(onednn_post_op_type::eltwise_act, empty_mem);
}
} else if (desc.is_type<eltwise>()) {
auto dep_idx = desc.dep_start_idx;
auto dep_idx = desc.outer_dep_start_idx;
auto in = get_dependency(dep_idx).get_output_layout();
auto set_binary_op = [&](dnnl::algorithm alg, onednn_post_op_type op_type) {
if (is_type<fully_connected>()) {
std::unique_ptr<const kernel_impl_params> impl_params = get_kernel_impl_params();
@ -988,7 +987,7 @@ void program_node::init_onednn_primitive_attributes() {
auto mem_desc = onednn::layout_to_memory_desc(in, dnnl::memory::format_tag::ab);
post_ops.append_binary(alg, mem_desc);
update_onednn_post_op_list(op_type, dep_idx, dnnl::memory::format_tag::ab, false,
mem_desc.get_dims(), mem_desc.get_data_type());
mem_desc.get_dims(), mem_desc.get_data_type());
} else if (is_type<gemm>()) {
size_t rank = cldnn::format::dimension(in.format);
size_t in_batched_size = in.count() / (in.spatial(0) * in.spatial(1));
@ -1001,7 +1000,7 @@ void program_node::init_onednn_primitive_attributes() {
auto mem_desc = onednn::layout_to_memory_desc(in);
post_ops.append_binary(alg, mem_desc);
update_onednn_post_op_list(op_type, dep_idx, onednn::convert_data_format(in.format), false,
mem_desc.get_dims(), mem_desc.get_data_type());
mem_desc.get_dims(), mem_desc.get_data_type());
}
};
@ -1029,7 +1028,7 @@ void program_node::init_onednn_primitive_attributes() {
OPENVINO_ASSERT(false, error_msg.str());
}
} else if (desc.is_type<quantize>()) {
auto dep_idx = desc.dep_start_idx;
auto dep_idx = desc.outer_dep_start_idx;
// ********************************* Common case with output range usage ********************************* //
const auto& q_param = desc.get_typed_fuse_params<QuantizeFuseParams>();
@ -1248,4 +1247,5 @@ void program_node::init_onednn_primitive_attributes() {
}
#endif // ENABLE_ONEDNN_FOR_GPU
#endif // ENABLE_ONEDNN_FOR_GPU

View File

@ -13,6 +13,7 @@
#include "reduce_inst.h"
#include "reshape_inst.h"
#include "gemm_inst.h"
#include "convolution_inst.h"
#include "pass_manager.h"
#include "to_string_utils.h"
@ -440,3 +441,73 @@ TEST(prepare_primitive_fusing, dont_remove_only_dep_reshape) {
ASSERT_NE(prog, nullptr);
ASSERT_TRUE(has_node(*prog, "reshape2"));
}
TEST(prepare_primitive_fusing, eltwise_fusing_residual_connection_taylor) {
// Extended eltwise fusing pattern
// in w
// \ /
// conv elt1_in1
// | \ /
// | elt1
// | |
// | act
// | /
// elt2
// |
// reorder
// Verifies that after prepare_primitive_fusing the eltwise chain is fused
// into conv, and that at runtime the fused kernel is used when the fused
// dependency shapes are compatible, but the node falls back to its unfused
// subgraph when they are not.
auto& engine = get_test_engine();
topology topology;
// Dynamic spatial dims on both the conv input and the residual eltwise input.
auto conv_in_layout = layout{ ov::PartialShape{1, 3, -1, -1}, data_types::f16, format::bfyx};
auto weight_layout = layout{ ov::PartialShape{10, 3, 3, 3}, data_types::f16, format::bfyx};
auto weight_mem = engine.allocate_memory(weight_layout);
auto weight_data = generate_random_4d<FLOAT16>(10, 3, 3, 3, -1, 1);
set_values(weight_mem, weight_data);
auto elt1_in1_layout = layout{ ov::PartialShape{1, 10, -1, -1}, data_types::f16, format::bfyx};
// Build the graph shown in the diagram above.
topology.add(data("weights", weight_mem));
topology.add(input_layout("conv_input", conv_in_layout));
topology.add(input_layout("elt1_input", elt1_in1_layout));
topology.add(convolution("conv", input_info("conv_input"), { "weights" }));
topology.add(eltwise("eltw1", { input_info("conv"), input_info("elt1_input") }, eltwise_mode::prod));
topology.add(activation("act", input_info("eltw1"), activation_func::erf));
topology.add(eltwise("elt2", { input_info("conv"), input_info("act") }, eltwise_mode::prod));
topology.add(reorder("reorder", input_info("elt2"), format::bfyx, data_types::f32));
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(true));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
auto prog = program::build_program(engine, topology, config, false, true);
layout_optimizer lo(true);
program_wrapper::apply_opt_pass<prepare_primitive_fusing>(*prog, lo);
ASSERT_NE(prog, nullptr);
// After the fusing pass no standalone eltwise node should remain in the program.
ASSERT_FALSE(has_node_with_type<eltwise>(*prog));
cldnn::network net(prog, 0);
// Valid case: shapes chosen so the fused dependency is compatible with the
// conv output — presumably 7x7 input with a 3x3 kernel yields 5x5, matching
// the 5x5 eltwise input (TODO confirm against convolution defaults).
auto conv_input_data = generate_random_4d<FLOAT16>(1, 3, 7, 7, -1, 1);
auto conv_input_mem = engine.allocate_memory(layout{ov::PartialShape{1, 3, 7, 7}, data_types::f16, format::bfyx});
set_values(conv_input_mem, conv_input_data);
auto elt_input_data = generate_random_4d<FLOAT16>(1, 10, 5, 5, -10, 10);
auto elt_input_mem = engine.allocate_memory(layout{ov::PartialShape{1, 10, 5, 5}, data_types::f16, format::bfyx});
set_values(elt_input_mem, elt_input_data);
net.set_input_data("conv_input", conv_input_mem);
net.set_input_data("elt1_input", elt_input_mem);
net.execute();
const auto& conv_inst = net.get_primitive("conv");
// Fused execution succeeded — no fallback subgraph was built.
ASSERT_FALSE(conv_inst->has_unfused_subgraph());
// Invalid case: a smaller conv input makes the fused eltwise shapes
// incompatible, so the runtime must unfuse and build a fallback subgraph.
auto conv_input_data2 = generate_random_4d<FLOAT16>(1, 3, 3, 3, -1, 1);
auto conv_input_mem2 = engine.allocate_memory(layout{ov::PartialShape{1, 3, 3, 3}, data_types::f16, format::bfyx});
set_values(conv_input_mem2, conv_input_data2);
net.set_input_data("conv_input", conv_input_mem2);
net.set_input_data("elt1_input", elt_input_mem);
net.execute();
ASSERT_TRUE(conv_inst->has_unfused_subgraph());
}