[GPU] Fixed fused_primitive_desc to have -1 value for dep_start_idx (#17099)
* Fixed fused_primitive_desc to have -1 value for dep_start_idx
* Fixed dGPU i8 errors
parent 3830125e3b
commit ce23ce00f1
@@ -44,7 +44,7 @@ struct fused_primitive_desc {
     bool operator==(const fused_primitive_desc& rhs) const {
         if (total_num_deps != rhs.total_num_deps)
             return false;
-        if (dep_start_idx != rhs.dep_start_idx)
+        if (outer_dep_start_idx != rhs.outer_dep_start_idx)
             return false;

         return *desc == *rhs.desc;
@@ -52,6 +52,8 @@ struct fused_primitive_desc {

     bool operator!=(const fused_primitive_desc& rhs) const { return !(*this == rhs); }

+    bool has_outer_dep() const { return outer_dep_start_idx >= 0; }
+
     std::shared_ptr<const primitive> desc;
     layout input_layout = layout(data_types::f32, format::bfyx, tensor());
@@ -61,7 +63,11 @@ struct fused_primitive_desc {
     std::vector<std::pair<primitive_id, size_t>> deps;
     std::map<primitive_id, size_t> fused_deps;
-    size_t dep_start_idx;
+    // TODO:
+    // Currently, it assumes very simple case where dep 0 is the fused node and no input sharing b/w fused node and peer node
+    // To cover such cases where some of the peer node uses input of fused node, we need to maintain actual indexes of the dependencies
+    // not only the "starting index".
+    int32_t outer_dep_start_idx = -1; // if -1, no external dep after fusing
     size_t total_num_deps = 0;
 };
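The core of the change above: the unsigned `dep_start_idx` member is replaced by a signed `outer_dep_start_idx` whose default of -1 means "this fused primitive brought no external input of its own", and callers are expected to check `has_outer_dep()` before indexing into the dependency list. A minimal standalone sketch of that pattern, using simplified stand-in types rather than the actual cldnn classes:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Simplified stand-in for cldnn::fused_primitive_desc (illustrative only).
struct fused_desc_sketch {
    // -1 means the fused primitive added no extra (outer) input during fusing.
    int32_t outer_dep_start_idx = -1;
    bool has_outer_dep() const { return outer_dep_start_idx >= 0; }
};

// Callers check the sentinel before indexing, instead of comparing an
// unsigned index against the dependency count.
int pick_outer_dep(const fused_desc_sketch& fd, const std::vector<int>& deps) {
    if (!fd.has_outer_dep())
        return -1;  // nothing to reorder/broadcast for this fused primitive
    assert(static_cast<size_t>(fd.outer_dep_start_idx) < deps.size());
    return deps[fd.outer_dep_start_idx];
}
```

This is why the equality operator and every pass touched below compares and guards on `outer_dep_start_idx` rather than on an always-valid unsigned index.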
@@ -73,10 +73,9 @@ void add_required_reorders::run(program& p) {
             if (!fused_op.is_type<eltwise>() && !(fused_op.is_type<activation>() && fused_op.total_num_deps == 2))
                 continue;

-            auto dep_id = fused_op.dep_start_idx;
-            if (dep_id >= usr->get_dependencies().size())
+            if (!fused_op.has_outer_dep())
                 continue;

+            auto dep_id = fused_op.outer_dep_start_idx;
             auto& dep = usr->get_dependency(dep_id);
             if (!dep.is_type<data>())
                 continue;
@@ -43,8 +43,9 @@ void basic_memory_dependencies::run(program& p) {
             auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
             if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
                 continue;

-            eltw_dep = fused_op.dep_start_idx;
+            if (!fused_op.has_outer_dep())
+                continue;
+            eltw_dep = fused_op.outer_dep_start_idx;
             auto& eltw_node = node->get_dependency(eltw_dep);
             eltw_node.can_share_buffer(false);
             node->can_share_buffer(false);
@@ -545,7 +545,7 @@ void remove_redundant_reorders::run(program& p) {
         local_desc.input_layout = input.get_dependency(0).get_output_layout(); // original convolution's output layout
         node->set_input_layout(local_desc.input_layout);
         local_desc.f_param = node->get_fuse_params();
-        local_desc.dep_start_idx = input.get_fused_primitives().size();
+        local_desc.outer_dep_start_idx = -1;
         local_desc.output_layout = output_layout;
         input.add_fused_primitive(local_desc);
@@ -862,8 +862,8 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
     onednn_add_fusing_helpers::for_eltwise(conv_node, eltwise_mode::sum,
         [&](const program_node& p_node, const fused_primitive_desc& desc) {
             auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(p_node, desc);
-            if (fusing_type == add_fusing_type::binary_per_tensor) {
-                auto& dep_node = p_node.get_dependency(desc.dep_start_idx);
+            if (fusing_type == add_fusing_type::binary_per_tensor && desc.has_outer_dep()) {
+                auto& dep_node = p_node.get_dependency(desc.outer_dep_start_idx);
                 auto d_layout = dep_node.get_output_layout();
                 auto d_format = d_layout.format;
                 auto expected_format = format::any;
@@ -885,9 +885,9 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
                 new_layout.format = expected_format;
                 auto new_input = rf.get_reorder(dep_node.id(), d_layout, new_layout);
                 if (new_input.first) {
-                    p.add_intermediate(new_input.first, conv_node, desc.dep_start_idx, !new_input.second);
+                    p.add_intermediate(new_input.first, conv_node, desc.outer_dep_start_idx, !new_input.second);
                 }
-                conv_node.get_dependency(desc.dep_start_idx).set_output_layout(new_layout, false);
+                conv_node.get_dependency(desc.outer_dep_start_idx).set_output_layout(new_layout, false);
             }
         }
     });
@@ -965,7 +965,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
             if (activation_desc->activation_function == cldnn::activation_func::relu_negative_slope &&
                 !activation_desc->additional_params_input.empty()) {
                 const auto expected_dt = data_types::f32;
-                const auto dep_idx = fused_desc.dep_start_idx;
+                const auto dep_idx = fused_desc.outer_dep_start_idx;
                 const auto orig_layout = node->get_dependency(dep_idx).get_output_layout();
                 if (orig_layout.data_type == expected_dt)
                     continue;
@@ -992,7 +992,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
         for (const auto& fused_prim : node->get_fused_primitives()) {
             if (fused_prim.is_type<eltwise>() &&
                 one_of(fused_prim.typed_desc<eltwise>()->mode, {eltwise_mode::sum, eltwise_mode::sub, eltwise_mode::prod})) {
-                auto& data = node->get_dependency(fused_prim.dep_start_idx);
+                auto& data = node->get_dependency(fused_prim.outer_dep_start_idx);

                 auto gemm_layout = node->get_output_layout();
                 auto data_layout = data.get_output_layout();
@@ -1016,7 +1016,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
                 auto broadcast_prim = std::make_shared<cldnn::broadcast>(prim_id, cldnn::input_info(data.id()), gemm_layout.get_shape(), ov::AxisSet{});

                 auto& broadcast_node = p.get_or_create(broadcast_prim);
-                p.add_intermediate(broadcast_node, *node, fused_prim.dep_start_idx, true);
+                p.add_intermediate(broadcast_node, *node, fused_prim.outer_dep_start_idx, true);
                 broadcast_node.recalc_output_layouts(false);
             }
         }
@@ -1025,7 +1025,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
             if (fused_prim.is_type<eltwise>() &&
                 one_of(fused_prim.typed_desc<eltwise>()->mode, {eltwise_mode::sum, eltwise_mode::sub, eltwise_mode::prod})) {
                 auto fc_layout = node->get_output_layout();
-                auto& data = node->get_dependency(fused_prim.dep_start_idx);
+                auto& data = node->get_dependency(fused_prim.outer_dep_start_idx);
                 auto data_layout = data.get_output_layout();

                 if (fc_layout.is_dynamic() || data_layout.is_dynamic())
@@ -1060,7 +1060,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
                 auto broadcast_prim = std::make_shared<cldnn::broadcast>(prim_id, cldnn::input_info(data.id()), fc_layout.get_shape(), ov::AxisSet{});

                 auto& broadcast_node = p.get_or_create(broadcast_prim);
-                p.add_intermediate(broadcast_node, *node, fused_prim.dep_start_idx, true);
+                p.add_intermediate(broadcast_node, *node, fused_prim.outer_dep_start_idx, true);
                 broadcast_node.recalc_output_layouts(false);
             }
         }
@@ -147,14 +147,15 @@ inline params_t get_default_params(const kernel_impl_params& param_info, bool is
         OPENVINO_ASSERT(desc.op_params != nullptr, "[GPU] Invalid fused operation (", param_info.desc->id , ") of type ", param_info.desc->type_string());

-        desc.dep_idx_start = fused_prim.dep_start_idx;
+        desc.dep_idx_start = fused_prim.outer_dep_start_idx;
         desc.dep_size = fused_prim.deps.size();
         desc.op_id = op_id++;
         desc.output_tensor = convert_data_tensor(fused_prim.output_layout);
         prim_id_type_map[fused_prim.desc->id] = std::make_pair(desc.op_id, desc.output_tensor.GetDType());

-        for (size_t i = desc.dep_idx_start; i < desc.dep_idx_start + desc.dep_size; i++) {
-            desc.tensors.push_back(convert_data_tensor(param_info.get_input_layout(i)));
+        if (fused_prim.has_outer_dep()) {
+            for (size_t i = desc.dep_idx_start; i < desc.dep_idx_start + desc.dep_size; i++) {
+                desc.tensors.push_back(convert_data_tensor(param_info.get_input_layout(i)));
+            }
         }

         if (fused_prim.total_num_deps > 0) {
@@ -334,7 +335,7 @@ inline kernel_impl_params canonicalize_fused_shapes(const kernel_impl_params& im
         if (fd.is_type<eltwise>() && fd.total_num_deps == 2) {
             auto out_pshape = updated_impl_params.output_layouts[0].get_partial_shape();

-            auto& dep_layout = updated_impl_params.input_layouts[fd.dep_start_idx];
+            auto& dep_layout = updated_impl_params.input_layouts[fd.outer_dep_start_idx];
             auto dep_shape = dep_layout.get_partial_shape();

             if (!broadcastable(dep_shape, out_pshape, use_new_shape_infer)) {
@@ -201,6 +201,7 @@ public:
     bool can_share_buffer() const { return _can_share_buffer; }
     bool is_constant() const { return _is_constant; }
     bool is_output_event() const { return _is_output_event; }
+    bool has_unfused_subgraph() const { return (_unfused_subgraph != nullptr); }

     void allocate_internal_buffers();
     static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node,
@@ -104,6 +104,24 @@ public:

     bool is_fused_dep(size_t dep_idx) const;

+    bool has_fused_dep() const {
+        for (auto fused : get_fused_primitives()) {
+            if (fused.has_outer_dep())
+                return true;
+        }
+        return false;
+    }
+
+    int32_t get_first_fused_dep_idx() const {
+        if (!has_fused_dep())
+            return -1;
+        for (auto fused : get_fused_primitives()) {
+            if (fused.has_outer_dep())
+                return fused.outer_dep_start_idx;
+        }
+        return -1;
+    }
+
     std::map<size_t, memory::ptr> get_const_memory_deps() const;

     virtual std::unique_ptr<kernel_impl_params> get_kernel_impl_params() const {
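The two helpers added here are consumed further down in the diff, most visibly in the `primitive_inst` constructor, which now derives the fused memory offset only when some fused primitive actually has an outer dependency. A hedged sketch of that usage, with a hypothetical simplified node type:

```cpp
#include <cstdint>
#include <vector>

// Hypothetical, simplified model of a node holding one outer-dep index per
// fused primitive (-1 entries mean "no outer dependency").
struct node_sketch {
    std::vector<int32_t> fused_outer_dep_idxs;

    bool has_fused_dep() const {
        for (auto idx : fused_outer_dep_idxs)
            if (idx >= 0) return true;
        return false;
    }

    int32_t get_first_fused_dep_idx() const {
        for (auto idx : fused_outer_dep_idxs)
            if (idx >= 0) return idx;  // first fused primitive with an outer dep
        return -1;
    }
};

// Mirrors the later primitive_inst change: fall back to 0 unless at least one
// fused primitive really contributes an outer input.
size_t fused_mem_offset(const node_sketch& n, size_t fused_mem_count) {
    return (fused_mem_count > 0 && n.has_fused_dep())
               ? static_cast<size_t>(n.get_first_fused_dep_idx())
               : 0;
}
```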
@@ -405,7 +405,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
         for (auto& p : next.get_fused_primitives()) {
             // find eltwise sum primitive which has dependency nodes, and gather dependency indices of it.
             if (p.is_type<eltwise>() && p.typed_desc<eltwise>()->mode == eltwise_mode::sum) {
-                for (size_t i = p.dep_start_idx; i < p.dep_start_idx + p.total_num_deps; i++) {
+                for (size_t i = p.outer_dep_start_idx; i < p.outer_dep_start_idx + p.total_num_deps; i++) {
                     dep_idx_set.insert(i);
                 }
             }
@@ -604,7 +604,9 @@ void network::save(cldnn::BinaryOutputBuffer& ob) {
             auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
             if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
                 continue;
-            eltw_dep = fused_op.dep_start_idx;
+            if (!fused_op.has_outer_dep())
+                continue;
+            eltw_dep = fused_op.outer_dep_start_idx;
             auto& eltw_in = node->get_dependency(eltw_dep);
             if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
                 reuse_map[node->id()] = eltw_in.id();
@@ -1007,7 +1009,9 @@ void network::allocate_primitives() {
             auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
             if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
                 continue;
-            eltw_dep = fused_op.dep_start_idx;
+            if (!fused_op.has_outer_dep())
+                continue;
+            eltw_dep = fused_op.outer_dep_start_idx;
             auto& eltw_in = node->get_dependency(eltw_dep);
             if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
                 auto& eltw_inst = _primitives.at(eltw_in.id());
@@ -600,7 +600,7 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
     , _inputs_memory_count(node.get_primitive()->input_size())
     , _outputs_memory_count(node.get_primitive()->output_size())
    , _fused_mem_count(node.get_fused_inputs_count())
-    , _fused_mem_offset(_fused_mem_count > 0 ? node.get_fused_primitives()[0].dep_start_idx : 0)
+    , _fused_mem_offset((_fused_mem_count > 0 && node.has_fused_dep()) ? node.get_first_fused_dep_idx() : 0)
     , _can_be_optimized(node.can_be_optimized())
     , _can_share_buffer(node.can_share_buffer())
     , _is_constant(node.is_constant()) {
@@ -971,7 +971,7 @@ cldnn::network::ptr primitive_inst::get_unfused_subgraph() {
     if (!_unfused_subgraph) {
         topology t;

-        std::vector<primitive_id> dep_ids;
+        std::vector<primitive_id> outer_dep_ids;
         // Add input primitives: constants are moved as is
         // Any other primitive types are replaced with input_layout
         for (auto& dep : _node->get_dependencies()) {
@@ -985,12 +985,12 @@ cldnn::network::ptr primitive_inst::get_unfused_subgraph() {
                 input_layout in_prim(dep.first->id(), dep.first->get_output_layout());
                 t.add(in_prim);
             }
-            dep_ids.push_back(dep.first->id());
+            outer_dep_ids.push_back(dep.first->id());
         }

         // Create the primitive itself
         t.add_primitive(std::const_pointer_cast<primitive>(_node->get_primitive()));
-        dep_ids.push_back(_node->id());
+        outer_dep_ids.push_back(_node->id());

         // Add primitives for fused-ops
         for (auto& fd : _impl_params->fused_desc) {
@@ -1008,25 +1008,26 @@ cldnn::network::ptr primitive_inst::get_unfused_subgraph() {
             // And when we construct unfused subgraph for prim2, we take original eltwise2 primitive which expects eltwise1 primitive as input
             // which doesn't exist anymore in the graph
             // Thus we update dependency name used dependencies idx stored in fused descriptor.
-            if (std::find_if(dep_ids.begin(), dep_ids.end(),
-                             [&](const primitive_id& pid) {
-                                 return pid == in.pid;
-                             }) == dep_ids.end()) {
-                size_t dep_id = fd.dep_start_idx;
-                in = _node->get_dependency(dep_id).id();
+            if (fd.has_outer_dep()) {
+                if (std::find_if(outer_dep_ids.begin(), outer_dep_ids.end(), [&](const primitive_id& pid) {
+                        return pid == in.pid;
+                    }) == outer_dep_ids.end()) {
+                    size_t dep_id = fd.outer_dep_start_idx;
+                    in = _node->get_dependency(dep_id).id();
+                }
             }
         }
         t.add_primitive(prim);
-        dep_ids.push_back(prim->id);
+        outer_dep_ids.push_back(prim->id);
     }
     // Samely, need to update dependency of the current fused nodes' input primitive ids with those in the current program
     auto prim_of_fused_node = std::const_pointer_cast<primitive>(_impl_params->desc);
     for (size_t i = 0; i < prim_of_fused_node->input.size(); ++i) {
         auto& in = prim_of_fused_node->input[i];
-        if (std::find_if(dep_ids.begin(), dep_ids.end(),
+        if (std::find_if(outer_dep_ids.begin(), outer_dep_ids.end(),
                          [&](const primitive_id& pid) {
                              return pid == in.pid;
-                         }) == dep_ids.end()) {
+                         }) == outer_dep_ids.end()) {
             in = _node->get_dependency(i).id();
         }
     }
@@ -1048,11 +1049,12 @@ bool primitive_inst::is_valid_fusion() const {
     auto fuse_descriptors = _impl_params->fused_desc;
     if (fuse_descriptors.empty())
         return true;

     std::vector<fused_primitive_desc> fused_eltwise_prims;
     for (auto& fd : fuse_descriptors) {
-        if (fd.is_type<eltwise>()) {
+        if (fd.is_type<eltwise>() || fd.is_type<activation>()) {
             fused_eltwise_prims.push_back(fd);
         } else {
             OPENVINO_ASSERT("[GPU] Unsupported fused operation in dynamic shape : ", fd.desc->id);
         }
     }
@@ -1061,14 +1063,16 @@ bool primitive_inst::is_valid_fusion() const {

     auto out_pshape = _impl_params->get_output_layout().get_partial_shape();
     for (auto& fd : fused_eltwise_prims) {
-        auto dep_idx = fd.dep_start_idx;
-        OPENVINO_ASSERT(fd.total_num_deps == 2, "[GPU] Unexpected count of dependencies in dynamic fusion for eltwise");
-        OPENVINO_ASSERT(_deps.size() > dep_idx, "[GPU] Invalid fused dependency idx");
-        auto dep = _deps[dep_idx];
+        auto outer_dep_idx = fd.outer_dep_start_idx;
+        if (outer_dep_idx < 0) // no outer dep
+            continue;
+        OPENVINO_ASSERT(fd.total_num_deps == 2, "[GPU] Unexpected count of dependencies in dynamic fusion for eltwise or activation");
+        OPENVINO_ASSERT(outer_dep_idx < 0 || static_cast<int32_t>(_deps.size()) > outer_dep_idx, "[GPU] Invalid fused dependency idx");
+        auto outer_dep = _deps[outer_dep_idx];

-        auto dep_pshape = dep.first->_impl_params->get_output_layout().get_partial_shape();
+        auto outer_dep_pshape = outer_dep.first->_impl_params->get_output_layout().get_partial_shape();
         auto merged_shape = out_pshape;
-        auto can_broadcast = ov::PartialShape::broadcast_merge_into(merged_shape, dep_pshape, fd.typed_desc<eltwise>()->broadcast_spec);
+        auto can_broadcast = ov::PartialShape::broadcast_merge_into(merged_shape, outer_dep_pshape, fd.typed_desc<eltwise>()->broadcast_spec);

 #ifdef ENABLE_ONEDNN_FOR_GPU
         // WA for OneDNN binary add fusions: we need to broadcast batch dimension to avoid situation with
@@ -1079,7 +1083,7 @@ bool primitive_inst::is_valid_fusion() const {
         // correctly and we need to do it manually
         if (_node->is_type<gemm>() && _node->get_preferred_impl_type() == impl_types::onednn) {
             auto gemm_layout = _impl_params->get_output_layout();
-            auto data_layout = dep.first->_impl_params->get_output_layout();
+            auto data_layout = outer_dep.first->_impl_params->get_output_layout();
             auto gemm_dims = onednn::convert_gemm_tensor(gemm_layout.get_tensor(),
                                                          cldnn::format::dimension(gemm_layout.format),
                                                          false);
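With the sentinel in place, `is_valid_fusion()` now skips fused primitives that have no outer dependency and validates broadcastability only for the outer input's shape; when that check fails at runtime, execution falls back to the unfused subgraph, which the new test at the bottom observes through `has_unfused_subgraph()`. A small sketch of the broadcast check using plain `ov::PartialShape` (numpy-style broadcasting assumed, in the spirit of the `broadcast_merge_into` call above):

```cpp
#include <openvino/core/partial_shape.hpp>
#include <openvino/op/util/attr_types.hpp>

// The fusion stays valid only if the outer dependency's shape can be
// broadcast into the primitive's output shape.
bool outer_dep_fusion_is_valid(const ov::PartialShape& out_shape,
                               const ov::PartialShape& outer_dep_shape) {
    ov::PartialShape merged = out_shape;
    return ov::PartialShape::broadcast_merge_into(
        merged, outer_dep_shape, ov::op::AutoBroadcastType::NUMPY);
}
```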
@@ -1077,11 +1077,11 @@ void program::fuse_nodes(program_node &fused_node,
     auto peer_layout = peer_node.get_output_layout();
     fused_primitive_desc local_desc(peer_node.get_primitive());
     local_desc.f_param = get_node_ptr(peer_node.id())->get_fuse_params();
-    local_desc.dep_start_idx = fused_node.get_dependencies().size();
     local_desc.total_num_deps = peer_node.get_dependencies().size();
     local_desc.input_layout = peer_node.get_dependency(0).get_output_layout();
     local_desc.output_layout = peer_layout;

+    int32_t orig_fused_node_num_deps = static_cast<int32_t>(fused_node.get_dependencies().size());
     auto fusedPadding = fused_node.get_output_layout().data_padding;
     cldnn::padding needed_padding = padding::max(peer_layout.data_padding,
                                                  fusedPadding);
@@ -1092,7 +1092,6 @@ void program::fuse_nodes(program_node &fused_node,
             local_desc.fused_deps.emplace(id.first, id.second);
         }
     }
-
     // Add new dependencies to the fused_node
     size_t deps_idx = 0;
     for (size_t i = 0; i < peer_node.get_dependencies().size(); i++) {
@@ -1129,6 +1128,10 @@ void program::fuse_nodes(program_node &fused_node,
         local_desc.deps.emplace_back(dep.id(), deps_idx++);
         dep.users.push_back(&fused_node);
     }
+    if (local_desc.deps.size()) {
+        local_desc.outer_dep_start_idx = orig_fused_node_num_deps;
+    }
+
     local_desc.total_num_deps = std::min(local_desc.total_num_deps, deps_idx);

     fused_node.add_fused_primitive(local_desc);
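The `fuse_nodes` hunk above is the only place where `outer_dep_start_idx` moves off its -1 default: it is set to the fused node's original dependency count, and only when fusing actually appended new (outer) dependencies taken from the peer node. A simplified sketch of that bookkeeping (hypothetical names, not the real `program::fuse_nodes`):

```cpp
#include <cstdint>
#include <string>
#include <vector>

struct desc_sketch {
    int32_t outer_dep_start_idx = -1;  // stays -1 if fusing adds no external input
};

// Toy model of the fusing step: the peer's non-shared inputs are appended to
// the fused node's dependency list, and the descriptor records where the
// appended block starts.
desc_sketch fuse_sketch(std::vector<std::string>& fused_node_deps,
                        const std::vector<std::string>& peer_extra_deps) {
    desc_sketch d;
    const auto orig_dep_count = static_cast<int32_t>(fused_node_deps.size());
    fused_node_deps.insert(fused_node_deps.end(),
                           peer_extra_deps.begin(), peer_extra_deps.end());
    if (!peer_extra_deps.empty())
        d.outer_dep_start_idx = orig_dep_count;  // index of the first appended dep
    return d;
}
```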
@@ -100,8 +100,10 @@ add_fusing_type onednn_add_fusing_helpers::get_add_fusing_type(
     if (desc.typed_desc<eltwise>()->mode != eltwise_mode::sum) {
         return add_fusing_type::not_supported;
     }
-
-    auto& dep_node = p_node.get_dependency(desc.dep_start_idx);
+    if (!desc.has_outer_dep()) {
+        return add_fusing_type::not_supported;
+    }
+    auto& dep_node = p_node.get_dependency(desc.outer_dep_start_idx);
     auto p_layout = p_node.get_output_layout();
     auto d_layout = dep_node.get_output_layout();

@@ -123,7 +123,7 @@ std::unique_ptr<json_composite> program_node::desc_to_json() const {
             dep_ids.push_back(dep.first);
         }
         fused_node_info.add("dependencies", dep_ids);
-        fused_node_info.add("dep start_idx", fused_desc.dep_start_idx);
+        fused_node_info.add("dep start_idx", fused_desc.outer_dep_start_idx);
         json_composite info;
         info.add("data type", dt_to_str(fused_desc.output_layout.data_type));
         info.add("format", output_layouts[0].format.to_string());
@@ -380,7 +380,7 @@ bool program_node::has_padded_dependency() const {

 bool program_node::is_fused_dep(size_t dep_idx) const {
     for (auto fused : get_fused_primitives()) {
-        if (dep_idx >= fused.dep_start_idx) {
+        if (fused.has_outer_dep() && static_cast<int32_t>(dep_idx) >= fused.outer_dep_start_idx) {
             return true;
         }
     }
@@ -944,7 +944,7 @@ void program_node::init_onednn_primitive_attributes() {
             auto fused_desc = desc.typed_desc<activation>();
             if (fused_desc->activation_function == cldnn::activation_func::relu_negative_slope
                 && !fused_desc->additional_params_input.empty()) {
-                auto dep_idx = cldnn_post_ops[idx].dep_start_idx;
+                auto dep_idx = cldnn_post_ops[idx].outer_dep_start_idx;
                 int oc_dim = static_cast<int>(desc.output_layout.get_tensor().feature.size());
                 post_ops.append_prelu(1 << oc_dim);
                 update_onednn_post_op_list(onednn_post_op_type::binary_relu, dep_idx);
@@ -975,9 +975,8 @@ void program_node::init_onednn_primitive_attributes() {
                 update_onednn_post_op_list(onednn_post_op_type::eltwise_act, empty_mem);
             }
         } else if (desc.is_type<eltwise>()) {
-            auto dep_idx = desc.dep_start_idx;
+            auto dep_idx = desc.outer_dep_start_idx;
             auto in = get_dependency(dep_idx).get_output_layout();

             auto set_binary_op = [&](dnnl::algorithm alg, onednn_post_op_type op_type) {
                 if (is_type<fully_connected>()) {
                     std::unique_ptr<const kernel_impl_params> impl_params = get_kernel_impl_params();
@@ -988,7 +987,7 @@ void program_node::init_onednn_primitive_attributes() {
                     auto mem_desc = onednn::layout_to_memory_desc(in, dnnl::memory::format_tag::ab);
                     post_ops.append_binary(alg, mem_desc);
                     update_onednn_post_op_list(op_type, dep_idx, dnnl::memory::format_tag::ab, false,
-                                               mem_desc.get_dims(), mem_desc.get_data_type());
+                                                mem_desc.get_dims(), mem_desc.get_data_type());
                 } else if (is_type<gemm>()) {
                     size_t rank = cldnn::format::dimension(in.format);
                     size_t in_batched_size = in.count() / (in.spatial(0) * in.spatial(1));
@@ -1001,7 +1000,7 @@ void program_node::init_onednn_primitive_attributes() {
                     auto mem_desc = onednn::layout_to_memory_desc(in);
                     post_ops.append_binary(alg, mem_desc);
                     update_onednn_post_op_list(op_type, dep_idx, onednn::convert_data_format(in.format), false,
-                                               mem_desc.get_dims(), mem_desc.get_data_type());
+                                                mem_desc.get_dims(), mem_desc.get_data_type());
                 }
             };
@@ -1029,7 +1028,7 @@ void program_node::init_onednn_primitive_attributes() {
                 OPENVINO_ASSERT(false, error_msg.str());
             }
         } else if (desc.is_type<quantize>()) {
-            auto dep_idx = desc.dep_start_idx;
+            auto dep_idx = desc.outer_dep_start_idx;

             // ********************************* Common case with output range usage ********************************* //
             const auto& q_param = desc.get_typed_fuse_params<QuantizeFuseParams>();
@@ -1248,4 +1247,5 @@ void program_node::init_onednn_primitive_attributes() {
 }


-#endif // ENABLE_ONEDNN_FOR_GPU
+#endif // ENABLE_ONEDNN_FOR_GPU
+
@@ -13,6 +13,7 @@
 #include "reduce_inst.h"
 #include "reshape_inst.h"
 #include "gemm_inst.h"
+#include "convolution_inst.h"
 #include "pass_manager.h"
 #include "to_string_utils.h"

@@ -440,3 +441,73 @@ TEST(prepare_primitive_fusing, dont_remove_only_dep_reshape) {
     ASSERT_NE(prog, nullptr);
     ASSERT_TRUE(has_node(*prog, "reshape2"));
 }
+
+TEST(prepare_primitive_fusing, eltwise_fusing_residual_connection_taylor) {
+    // Extended eltwise fusing pattern
+    //   in    w
+    //    \   /
+    //    conv    elt1_in1
+    //     | \    /
+    //     |  elt1
+    //     |   |
+    //     |  act
+    //     |  /
+    //     elt2
+    //      |
+    //   reorder
+    auto& engine = get_test_engine();
+    topology topology;
+    auto conv_in_layout = layout{ ov::PartialShape{1, 3, -1, -1}, data_types::f16, format::bfyx};
+    auto weight_layout = layout{ ov::PartialShape{10, 3, 3, 3}, data_types::f16, format::bfyx};
+    auto weight_mem = engine.allocate_memory(weight_layout);
+    auto weight_data = generate_random_4d<FLOAT16>(10, 3, 3, 3, -1, 1);
+    set_values(weight_mem, weight_data);
+    auto elt1_in1_layout = layout{ ov::PartialShape{1, 10, -1, -1}, data_types::f16, format::bfyx};
+
+    topology.add(data("weights", weight_mem));
+    topology.add(input_layout("conv_input", conv_in_layout));
+    topology.add(input_layout("elt1_input", elt1_in1_layout));
+    topology.add(convolution("conv", input_info("conv_input"), { "weights" }));
+    topology.add(eltwise("eltw1", { input_info("conv"), input_info("elt1_input") }, eltwise_mode::prod));
+    topology.add(activation("act", input_info("eltw1"), activation_func::erf));
+    topology.add(eltwise("elt2", { input_info("conv"), input_info("act") }, eltwise_mode::prod));
+    topology.add(reorder("reorder", input_info("elt2"), format::bfyx, data_types::f32));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    auto prog = program::build_program(engine, topology, config, false, true);
+
+    layout_optimizer lo(true);
+
+    program_wrapper::apply_opt_pass<prepare_primitive_fusing>(*prog, lo);
+    ASSERT_NE(prog, nullptr);
+    ASSERT_FALSE(has_node_with_type<eltwise>(*prog));
+
+    cldnn::network net(prog, 0);
+
+    // Valid
+    auto conv_input_data = generate_random_4d<FLOAT16>(1, 3, 7, 7, -1, 1);
+    auto conv_input_mem = engine.allocate_memory(layout{ov::PartialShape{1, 3, 7, 7}, data_types::f16, format::bfyx});
+    set_values(conv_input_mem, conv_input_data);
+
+    auto elt_input_data = generate_random_4d<FLOAT16>(1, 10, 5, 5, -10, 10);
+    auto elt_input_mem = engine.allocate_memory(layout{ov::PartialShape{1, 10, 5, 5}, data_types::f16, format::bfyx});
+    set_values(elt_input_mem, elt_input_data);
+
+    net.set_input_data("conv_input", conv_input_mem);
+    net.set_input_data("elt1_input", elt_input_mem);
+
+    net.execute();
+    const auto& conv_inst = net.get_primitive("conv");
+    ASSERT_FALSE(conv_inst->has_unfused_subgraph());
+
+    // Invalid => unfusion
+    auto conv_input_data2 = generate_random_4d<FLOAT16>(1, 3, 3, 3, -1, 1);
+    auto conv_input_mem2 = engine.allocate_memory(layout{ov::PartialShape{1, 3, 3, 3}, data_types::f16, format::bfyx});
+    set_values(conv_input_mem2, conv_input_data2);
+    net.set_input_data("conv_input", conv_input_mem2);
+    net.set_input_data("elt1_input", elt_input_mem);
+    net.execute();
+    ASSERT_TRUE(conv_inst->has_unfused_subgraph());
+}