[GPU] Enable runtime buffer fusing for dynamic shape (#17668)

* Initial impl for runtime buffer fusing
Passing unit tests with the static kernel

* Pass unit tests with the dynamic impl

* Refactor allocate_output

* Separate header of buffer fusing

* Refactored buffer fusing :: matcher/optimize

* More cleanup

* Fix crash in Dolly

* Reset can_be_optimized of primitive_inst when it can no longer be optimized

* Fix empty tensor: a primitive with empty data should be skipped

* Fix issue in dynamic padding: a static kernel should not contain dynamic padding dims
Fix missing reset of the update_shape_done_by_other flag

* Do not add an empty kernel to the cache for an optimized-out inst

* Fix corner case error in buffer fusing
- Shapes of some preds may not change, but update_impl is still needed because 1) paddings change and 2) output memory should be updated
- An optimizable impl should not be added to the cache

* Allow reorder & permute_ref as predecessors of an optimized concat

* Some more fixes:
Runtime buffer fusing is available only when all preds and the concat are dynamic
Runtime buffer fusing is to be executed only if the node is dynamic

* Fix the allocate_output arguments passed by get_estimated_device_mem_usage according to the new signature

* Fixed error in cascaded concat

Need to reinterpret the buffer even though the size is the same
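
Illustrative note (not part of the commit): the core idea of runtime buffer fusing for concat is that each predecessor writes directly into its own slice of a single shared output buffer, selected via lower/upper padding along the concat axis, so the concat kernel itself becomes a no-op. A minimal standalone C++ sketch of that idea, with purely hypothetical names:

#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

struct SliceView {           // hypothetical helper: one predecessor's window into the shared buffer
    float* base;             // shared concat output buffer
    size_t lower_pad;        // elements written by earlier inputs along the concat axis
    size_t len;              // this predecessor's extent along the concat axis
    size_t inner;            // contiguous elements per concat-axis step
    float* at(size_t c, size_t i) { return base + (lower_pad + c) * inner + i; }
};

int main() {
    const size_t inner = 4;            // e.g. flattened y*x
    const size_t len_a = 2, len_b = 3; // predecessor extents along the concat axis
    std::vector<float> shared((len_a + len_b) * inner, 0.f); // single buffer for the whole concat output

    SliceView a{shared.data(), 0,     len_a, inner};
    SliceView b{shared.data(), len_a, len_b, inner}; // lower pad of b equals the length of a

    // Each "predecessor kernel" writes straight into its slice of the shared buffer.
    for (size_t c = 0; c < len_a; ++c)
        for (size_t i = 0; i < inner; ++i) *a.at(c, i) = 1.f;
    for (size_t c = 0; c < len_b; ++c)
        for (size_t i = 0; i < inner; ++i) *b.at(c, i) = 2.f;

    // The concat is "optimized out": its output is simply the shared buffer, no copy kernel runs.
    assert(shared[0] == 1.f && shared[len_a * inner] == 2.f);
    std::cout << "in-place concat output ready without a copy" << std::endl;
    return 0;
}

In the changes below, these slices are expressed as layout paddings that are computed at runtime once the predecessor shapes are known.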
Author: Taylor Yeonbok Lee, 2023-06-02 12:39:28 -07:00, committed by GitHub
parent c3a54b0a6e
commit f670dc5a0d
7 changed files with 571 additions and 277 deletions


@ -1,7 +1,7 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "prepare_buffer_fusing.h"
#include "pooling_inst.h"
#include "primitive_inst.h"
#include "activation_inst.h"
@ -25,45 +25,7 @@
using namespace cldnn;
namespace {
struct concat_noop_optimization : pattern_match_optimization_typed<concat_noop_optimization, concatenation> {
// Removes concatenation nodes with single input.
using base = pattern_match_optimization_typed<concat_noop_optimization, concatenation>;
using base::base;
bool match(concatenation_node& node);
bool optimize(concatenation_node& node);
};
struct concat_in_place_optimization : pattern_match_optimization_typed<concat_in_place_optimization, concatenation> {
// Performs in-place concat optimization.
// Padding of predecessors is updated to use single buffer by all, which is output from concatenation.
// Then concatenation can be optimized out, as memory will be correctly filled by previous nodes.
// If one of the dependencies is also an optimized-out concatenation, then cascade adjustment is performed to update it.
// This optimization is expected to be executed in some topological order, as cascade adjustment is performed backwards.
using base = pattern_match_optimization_typed<concat_in_place_optimization, concatenation>;
using base::base;
// Runs concat in-place optimization and adds already optimized concatenations that need re-optimization to `needs_reoptimization`.
void optimize_cascade(concatenation_node& node, std::list<concatenation_node*>& need_reoptimization);
bool match(concatenation_node& node);
bool optimize(concatenation_node& node) {
std::list<concatenation_node*> need_reopt;
optimize_cascade(node, need_reopt);
while (!need_reopt.empty()) {
auto& prop = *need_reopt.front();
need_reopt.pop_front();
if (match(prop))
optimize_cascade(prop, need_reopt);
else
// TODO: Revert extra padding when cascade adjustment failed.
prop.can_be_optimized(false);
}
return false; // node not invalidated
}
};
namespace cldnn {
bool concat_noop_optimization::match(concatenation_node& node) {
if (node.is_output())
return false;
@ -82,19 +44,127 @@ bool concat_noop_optimization::optimize(concatenation_node& node) {
}
bool concat_in_place_optimization::match(concatenation_node& node) {
if (node.is_output())
return false;
if (node.has_fused_primitives())
return false;
if (node.is_dynamic())
return false;
std::vector<kernel_impl_params> pred_params;
for (auto pred : node.get_dependencies()) {
pred_params.push_back(*pred.first->get_kernel_impl_params());
}
return (match(node, *node.get_kernel_impl_params(), pred_params));
}
// reverted condition - if any of this node's inputs is used by more than one primitive
// and is not optimized concatenation then do not fuse buffers
// TODO: we need to add padding support for all optimized kernels to remove this condition
auto available_pred = [](const program_node& input) {
if (!input.is_type<pooling>() && !input.is_type<convolution>() && !input.is_type<quantize>() &&
!input.is_type<activation>() && !input.is_type<deconvolution>() && !input.is_type<concatenation>() &&
!input.is_type<crop>() && !input.is_type<eltwise>() && !input.is_type<resample>() &&
!input.is_type<reorder>() && !(input.is_type<permute>() && !input.as<permute>().is_rotating_except_batch()))
return false;
return true;
};
bool concat_in_place_optimization::match(const program_node& concat_node,
kernel_impl_params concat_params,
std::vector<kernel_impl_params> pred_params,
bool is_runtime) {
if (concat_node.is_output() || concat_params.fused_desc.size() > 0)
return false;
auto pred_nodes = concat_node.get_dependencies();
for (auto p : pred_nodes) {
// TODO : In dynamic shape only one user is allowed for optimized concat
// It is mainly because of the limited flexibility of the current exec order
// For now, we do shape_infer for all pred nodes and the concat when executing one of the predecessors for runtime buffer fusing
// So we need to ensure that shape_infer of all the parents of the other predecessors is done as well
// We would need to shuffle the exec order for that requirement, but currently only a simple method is applied, which is applicable
// only for simple patterns where the concat is the only user of all the preds
// Also, cascaded concat is not handled for dynamic shape for now
// If we have more flexible exec order handling in the future, we will be able to remove the condition below
if (p.first->is_dynamic() && p.first->get_users().size() > 1)
return false;
if (concat_node.is_dynamic() && !p.first->is_dynamic())
return false;
}
// If this is called from primitive_inst::execute() and the concat is static, that concat should already have been optimized at build time, not at runtime.
if (is_runtime && !concat_node.is_dynamic())
return false;
bool is_onednn_impl = false;
for (const auto& input : node.get_dependencies()) {
if (input.first->get_preferred_impl_type() == impl_types::onednn) {
for (const auto& fused_op : input.first->get_fused_primitives()) {
auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(*input.first, fused_op);
// For in place concatenation input layouts and data types must match.
// Also, it checks whether data along f-axis is aligned properly for implicit concat.
// Otherwise, use explicit concat instead.
auto output_format = concat_params.get_output_layout().format;
auto output_datatype = concat_params.get_output_layout().data_type;
auto concat_axis = concat_params.typed_desc<concatenation>()->axis;
auto def_fmt = format::get_default_format(concat_params.get_output_layout().get_rank());
auto lower_padd_in_axis = concat_params.get_output_layout().data_padding.lower_size().sizes(def_fmt)[concat_axis];
lower_padd_in_axis = std::max(lower_padd_in_axis,
pred_params[0].get_output_layout().data_padding.lower_size().sizes(def_fmt)[concat_axis]);
size_t idx = 0;
for (auto pred : pred_nodes) {
if (!available_pred(*pred.first))
return false;
if (pred.first->is_output())
return false;
// if an input is marked as network output, prevent optimizations
// which would affect a form of its output (unless debug flag is set),
// we also need to restrict input types to those which support padding on all axis
if (pred.first->is_dynamic() && is_runtime) {
if (!pred.first->is_padding_supported(concat_axis, lower_padd_in_axis))
return false;
}
// TODO: handle optimized reshape
if (pred.first->is_type<reshape>() && pred.first->can_be_optimized())
return false;
// TODO: Investigate if this condition is needed
if (pred.first->get_users().size() > 2)
return false;
// Check that input isn't optimized out concatenation along different axis.
if (pred.first->is_type<concatenation>() && pred.first->can_be_optimized()) {
// cascaded concat opt is not supported for dynamic shape yet
if (concat_node.is_dynamic() || is_runtime)
return false;
else if (pred.first->as<concatenation>().get_primitive()->axis != concat_axis)
return false;
}
// Check that input isn't optimized out non-concatenation.
if (!pred.first->is_type<concatenation>() && pred.first->can_be_optimized())
return false;
size_t concat_users = 0;
for (auto& user : pred.first->get_users())
if (user->is_type<concatenation>())
concat_users += 1;
// If input is used by more than one concatenation then they may require different paddings.
if (concat_users != 1)
return false;
layout pred_l = pred_params[idx].get_output_layout();
if (output_format != pred_l.format || output_datatype != pred_l.data_type)
return false;
if (pred_l.format.block_sizes().size() > 1)
return false;
// TODO: Below condition should be moved to program_node::supports_padding.
// This however will need updating the algorithm as it may make cascade adjustment impossible in some cases.
// It however would make normal optimizations possible in others, so this is a trade-off to be investigated.
if (idx != concat_node.get_dependencies().size() - 1) {
if ((pred_l.format == format::b_fs_yx_fsv16 || pred_l.format == format::b_fs_zyx_fsv16) &&
(pred_l.feature() % 16 != 0 || concat_axis != 1))
return false;
if ((pred_l.format == format::b_fs_yx_fsv32 || pred_l.format == format::b_fs_zyx_fsv32) &&
(pred_l.feature() % 32 != 0 || concat_axis != 1))
return false;
if (pred_l.format == format::b_fs_yx_fsv4 && (pred_l.feature() != 4 || concat_axis != 1))
return false;
}
if (pred.first->get_preferred_impl_type() == impl_types::onednn) {
for (const auto& fused_op : pred_params[idx].fused_desc) {
auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(*pred.first, fused_op);
if (add_type == add_fusing_type::sum)
return false;
else
@ -102,21 +172,42 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
}
// Optimized-out input node is no longer onednn impl.
if (!input.first->can_be_optimized())
if (!pred.first->can_be_optimized())
is_onednn_impl = true;
}
// If sibling is using onednn impl and batch > 1, the onednn impl cannot process the implicit concat'ed buffer.
// Onednn impls can process implicit concat'ed buffer only through buffer pointer manipulation.
if ((is_runtime && concat_params.get_output_layout().batch() > 1) ||
(!concat_node.is_dynamic() && concat_params.get_output_layout().batch() > 1)) {
for (auto& sib : pred.first->get_users()) {
if (sib->get_preferred_impl_type() == impl_types::onednn) {
return false;
}
}
}
auto input_padd = pred.first->get_output_layout().data_padding;
// Check that there isn't already some padding between inputs in concat axis.
// If node has already been optimized we skip this check - this is just cascade adjustment.
if (!concat_node.can_be_optimized()) {
if (idx != concat_node.get_dependencies().size() && input_padd.upper_size().sizes(def_fmt)[concat_axis] != 0)
return false;
if (idx != 0 && input_padd.lower_size().sizes(def_fmt)[concat_axis] != 0)
return false;
}
if (!concat_node.is_dynamic() || is_runtime)
lower_padd_in_axis += pred_params[idx].get_output_layout().get_tensor().sizes(def_fmt)[concat_axis];
idx++;
}
// Implicit concat for onednn only when use_usm and batch 1.
if (is_onednn_impl) {
bool use_usm = node.get_program().get_engine().use_unified_shared_memory();
layout out_l = node.get_output_layout();
bool use_usm = concat_node.get_program().get_engine().use_unified_shared_memory();
layout concat_out_l = concat_params.get_output_layout();
if (!use_usm)
return false;
if (out_l.batch() > 1)
if (concat_out_l.batch() > 1)
return false;
// TODO: cldnn cases should be updated. This logic is working for onednn only.
// white list for support fusing formats.
const std::vector<format> white_list = {
@ -128,140 +219,61 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
format::b_fs_zyx_fsv32,
format::b_fs_yx_fsv4,
};
if (std::find_if(white_list.begin(), white_list.end(), [&out_l](format fmt){ return (fmt == out_l.format); }) == std::end(white_list))
if (std::find_if(white_list.begin(), white_list.end(), [&concat_out_l](format fmt){ return (fmt == concat_out_l.format); }) == std::end(white_list))
return false;
}
// For in place concatenation input layouts and data types must match.
// Also, it checks whether data along f-axis is aligned properly for implicit concat.
// Otherwise, use explicit concat instead.
auto output_format = node.get_output_layout().format;
auto output_datatype = node.get_output_layout().data_type;
auto concat_axis = node.get_primitive()->axis;
auto def_fmt = format::get_default_format(node.get_output_layout().get_rank());
size_t idx = 0;
for (const auto& input : node.get_dependencies()) {
if (input.first->is_type<reshape>())
// reshapes should be optimized out.
return false;
layout l = input.first->get_output_layout();
if (output_format != l.format || output_datatype != l.data_type)
return false;
if (l.format.block_sizes().size() > 1)
return false;
// TODO: Below condition should be moved to program_node::supports_padding.
// This however will need updating the algorithm as it may make cascade adjustment impossible in some cases.
// It however would make normal optimizations possible in others, so this is a trade-off to be investigated.
if (idx != node.get_dependencies().size() - 1) {
if ((l.format == format::b_fs_yx_fsv16 || l.format == format::b_fs_zyx_fsv16) &&
(l.feature() % 16 != 0 || node.get_primitive()->axis != 1))
return false;
if ((l.format == format::b_fs_yx_fsv32 || l.format == format::b_fs_zyx_fsv32) &&
(l.feature() % 32 != 0 || node.get_primitive()->axis != 1))
return false;
if (l.format == format::b_fs_yx_fsv4 && (l.feature() != 4 || node.get_primitive()->axis != 1))
return false;
}
idx++;
}
auto lower_padd_in_axis = node.get_output_layout().data_padding.lower_size().sizes(def_fmt)[concat_axis];
lower_padd_in_axis = std::max(lower_padd_in_axis,
node.get_dependency(0).get_output_layout().data_padding.lower_size().sizes(def_fmt)[concat_axis]);
// check if concatenation in place can be applied for inputs set
idx = 0;
for (const auto& input : node.get_dependencies()) {
// reverted condition - if any of this node's inputs is used by more than one primitive
// and is not optimized concatenation then do not fuse buffers
// todo: we need add padding support for all optimized kernels to remove this condition
if (!input.first->is_type<pooling>() && !input.first->is_type<convolution>() && !input.first->is_type<quantize>() &&
!input.first->is_type<activation>() && !input.first->is_type<deconvolution>() &&
!input.first->is_type<concatenation>() && !input.first->is_type<crop>() && !input.first->is_type<eltwise>() &&
!input.first->is_type<resample>())
return false;
// if an input is marked as network output, prevent optimizations
// which would affect a form of its output (unless debug flag is set),
// we also need to restrict input types to those which support padding on all axis
if (input.first->is_output() || !input.first->is_padding_supported(concat_axis, lower_padd_in_axis))
return false;
// TODO: Investigate if this condition is needed
if (input.first->get_users().size() > 2)
return false;
// If sibling is using onednn impl and batch > 1, the onednn impl cannot process the implicit concat'ed buffer.
// Onednn impls can process implicit concat'ed buffer only through buffer pointer manipulation.
if (node.get_output_layout().batch() > 1) {
for (auto& sib : input.first->get_users()) {
if (sib->get_preferred_impl_type() == impl_types::onednn) {
return false;
}
}
}
// Check that input isn't optimized out concatenation along different axis.
if (input.first->is_type<concatenation>() && input.first->can_be_optimized() &&
input.first->as<concatenation>().get_primitive()->axis != concat_axis)
return false;
// Check that input isn't optimized out non-concatenation.
if (!input.first->is_type<concatenation>() && input.first->can_be_optimized())
return false;
size_t concat_users = 0;
for (auto& user : input.first->get_users())
if (user->is_type<concatenation>())
concat_users += 1;
// If input is used by more than one concatenation then they may require different paddings.
if (concat_users != 1)
return false;
auto input_padd = input.first->get_output_layout().data_padding;
// Check that there isn't already some padding between inputs in concat axis.
// If node has already been optimized we skip this check - this is just cascade adjustment.
if (!node.can_be_optimized()) {
if (idx != node.get_dependencies().size() && input_padd.upper_size().sizes(def_fmt)[concat_axis] != 0)
return false;
if (idx != 0 && input_padd.lower_size().sizes(def_fmt)[concat_axis] != 0)
return false;
}
lower_padd_in_axis += input.first->get_output_layout().get_tensor().sizes(def_fmt)[concat_axis];
idx += 1;
}
return true;
}
void concat_in_place_optimization::optimize_cascade(concatenation_node& node, std::list<concatenation_node*>& need_reoptimization) {
auto out_layout = node.get_output_layout();
auto out_rank = out_layout.get_rank();
auto concat_axis = node.get_primitive()->axis;
// We need to transform axis from bf[w][z]yx order to bfxy[z][w] due to tensor.sizes() usages here
std::vector<layout> preds_layouts;
for (auto dep : node.get_dependencies()) {
if (dep.first->is_type<concatenation>() && dep.first->can_be_optimized())
need_reoptimization.push_back(&dep.first->as<concatenation>());
preds_layouts.push_back(dep.first->get_output_layout());
}
layout concat_layout = node.get_output_layout();
update_in_place_concat_paddings(concat_layout, preds_layouts, node.get_primitive()->axis, false);
size_t i = 0;
for (auto& dep : node.get_dependencies()) {
dep.first->set_output_layout(preds_layouts[i]);
dep.first->can_share_buffer(false);
++i;
}
node.set_output_layout(concat_layout);
node.can_be_optimized(true);
}
void concat_in_place_optimization::update_in_place_concat_paddings(
layout& concat_out_layout,
std::vector<layout>& preds_layouts,
size_t concat_axis,
bool is_runtime) {
auto concat_out_rank = concat_out_layout.get_rank();
// We need to transform axis from bf[v][u][w][z]yx order to bfxy[z][w][u][v] due to tensor.sizes() usages here
// should be removed once pad representation is changed
auto concat_axis_legacy = concat_axis;
if (concat_axis_legacy >= 2) {
auto spatial_axis = concat_axis_legacy - 2;
// Default and minimum number of dimensions is 4
auto spatial_size = std::max<size_t>(out_rank, 4) - 2;
auto spatial_size = std::max<size_t>(concat_out_rank, 4) - 2;
concat_axis_legacy = spatial_size - spatial_axis - 1 + 2;
}
if (concat_out_layout.is_dynamic() && !is_runtime) {
// set dynamic pad dims for shape agnostic kernel
for (auto& dep_output_layout : preds_layouts) {
auto info_dynamic_pad = tensor(0).sizes();
info_dynamic_pad[concat_axis_legacy] = 1;
dep_output_layout.data_padding.set_dynamic_pad(tensor(info_dynamic_pad));
}
return;
}
// Select output padding by propagating all required input paddings.
auto padd = out_layout.data_padding;
for (auto input : node.get_dependencies()) {
auto inputPadding = input.first->get_output_layout().data_padding;
auto padd = concat_out_layout.data_padding;
for (auto input : preds_layouts) {
auto inputPadding = input.data_padding;
padd = padding::max(padd, inputPadding);
}
@ -270,21 +282,17 @@ void concat_in_place_optimization::optimize_cascade(concatenation_node& node, st
// For cascade adjustment override padding in concat axis to output padding.
// In other case match(...) already checked that only first/last input have lower/upper padding.
if (node.can_be_optimized()) {
lower_padd[concat_axis_legacy] = out_layout.data_padding.lower_size().sizes()[concat_axis_legacy];
upper_padd[concat_axis_legacy] = out_layout.data_padding.upper_size().sizes()[concat_axis_legacy];
}
node.set_output_padding(padding(lower_padd, upper_padd));
lower_padd[concat_axis_legacy] = concat_out_layout.data_padding.lower_size().sizes()[concat_axis_legacy];
upper_padd[concat_axis_legacy] = concat_out_layout.data_padding.upper_size().sizes()[concat_axis_legacy];
auto dyn_pad_dims = lower_padd;
dyn_pad_dims[concat_axis_legacy] = 1;
concat_out_layout.data_padding = padding(lower_padd, upper_padd);
upper_padd[concat_axis_legacy] += out_layout.get_dims()[concat_axis];
// apply concatenation in place optimization
for (const auto& input : node.get_dependencies()) {
auto input_length = input.first->get_output_layout().get_dims()[concat_axis];
if (input.first->is_type<concatenation>() && input.first->can_be_optimized())
need_reoptimization.push_back(&input.first->as<concatenation>());
upper_padd[concat_axis_legacy] += concat_out_layout.get_dims()[concat_axis];
// apply concatenation in place optimization
for (auto& pred_layout : preds_layouts) {
auto input_length = pred_layout.get_dims()[concat_axis];
// shrink upper pad so it points at the end of the input's buffer
//
// |--- lower padd ---| |---------- upper padd -----------|
@ -292,22 +300,18 @@ void concat_in_place_optimization::optimize_cascade(concatenation_node& node, st
upper_padd[concat_axis_legacy] -= input_length;
// set new padding for input
input.first->set_output_padding(padding(lower_padd, upper_padd));
if (is_runtime)
pred_layout.data_padding = padding(lower_padd, upper_padd, 0.f, tensor(dyn_pad_dims));
else
pred_layout.data_padding = padding(lower_padd, upper_padd, 0.f);
// move lower padd further
//
// |-------------- lower padd -------------|---------- upper padd -----------|
// |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
lower_padd[concat_axis_legacy] += input_length;
}
node.can_be_optimized(true);
for (auto dep : node.get_users()) {
dep->can_share_buffer(false);
}
}
} // namespace
} // namespace cldnn
static bool can_reshape_be_optimized(const reshape_node& node) {
return node.is_in_place() && !node.has_fused_primitives();

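The padding bookkeeping in update_in_place_concat_paddings above can be pictured with a small standalone sketch (hypothetical names, not the cldnn API): lower padding accumulates the extents of earlier inputs, upper padding shrinks toward zero for later inputs, and a concat axis >= 2 is remapped to the legacy bfxy[z][w] index before tensor.sizes() is used, assuming the rank-4 minimum seen in the code above.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

// Map a bf[w][z]yx axis to the legacy bfxy[z][w] index, mirroring the transform
// applied before tensor.sizes() is queried (illustrative reimplementation).
size_t to_legacy_axis(size_t axis, size_t rank) {
    if (axis < 2) return axis;
    size_t spatial_axis = axis - 2;
    size_t spatial_size = std::max<size_t>(rank, 4) - 2; // default/minimum rank is 4
    return spatial_size - spatial_axis - 1 + 2;
}

int main() {
    // Three inputs concatenated along one axis with extents 8, 4 and 4 (total 16).
    std::vector<size_t> pred_sizes = {8, 4, 4};
    size_t total = 0;
    for (size_t len : pred_sizes) total += len;

    size_t lower = 0, upper = total;
    std::vector<std::pair<size_t, size_t>> pads; // (lower_pad, upper_pad) per input
    for (size_t len : pred_sizes) {
        upper -= len;                 // shrink upper pad so it ends at this input's last element
        pads.emplace_back(lower, upper);
        lower += len;                 // move lower pad past this input
    }

    for (size_t i = 0; i < pads.size(); ++i)
        std::cout << "input" << i << ": lower_pad=" << pads[i].first
                  << " upper_pad=" << pads[i].second << std::endl;
    // Prints 0/8, 8/4 and 12/0: every input addresses the full 16-element buffer.

    std::cout << "axis 2 of a rank-4 layout maps to legacy index "
              << to_legacy_axis(2, 4) << std::endl; // 3
    return 0;
}
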

@ -0,0 +1,62 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "pass_manager.h"
#include "program_helpers.h"
#include "concatenation_inst.h"
#include <utility>
#include <list>
#include <vector>
using namespace cldnn;
namespace cldnn {
struct concat_noop_optimization : pattern_match_optimization_typed<concat_noop_optimization, concatenation> {
// Removes concatenation nodes with single input.
using base = pattern_match_optimization_typed<concat_noop_optimization, concatenation>;
using base::base;
bool match(concatenation_node& node);
bool optimize(concatenation_node& node);
};
struct concat_in_place_optimization : pattern_match_optimization_typed<concat_in_place_optimization, concatenation> {
// Performs in-place concat optimization.
// Padding of predecessors is updated to use single buffer by all, which is output from concatenation.
// Then concatenation can be optimized out, as memory will be correctly filled by previous nodes.
// If one of the dependencies is also an optimized-out concatenation, then cascade adjustment is performed to update it.
// This optimization is expected to be executed in some topological order, as cascade adjustment is performed backwards.
using base = pattern_match_optimization_typed<concat_in_place_optimization, concatenation>;
using base::base;
// Runs concat in-place optimization and adds already optimized concatenations that need re-optimization to
// `needs_reoptimization`.
void optimize_cascade(concatenation_node& node, std::list<concatenation_node*>& need_reoptimization);
static void update_in_place_concat_paddings(layout& concat_layout,
std::vector<layout>& preds_layouts,
size_t concat_axis,
bool is_runtime);
bool match(concatenation_node& node);
static bool match(const program_node& concat_node,
kernel_impl_params concat_params,
std::vector<kernel_impl_params> pred_params,
bool is_runtime = false);
bool optimize(concatenation_node& node) {
std::list<concatenation_node*> need_reopt;
optimize_cascade(node, need_reopt);
while (!need_reopt.empty()) {
auto& prop = *need_reopt.front();
need_reopt.pop_front();
if (match(prop))
optimize_cascade(prop, need_reopt);
else
// TODO: Revert extra padding when cascade adjustment failed.
prop.can_be_optimized(false);
}
return false; // node not invalidated
}
};
} // namespace cldnn


@ -156,6 +156,13 @@ public:
virtual void set_output_memory(memory::ptr mem, bool check = true, size_t idx = 0);
void check_memory_to_set(const memory& mem, const layout& layout) const;
const std::list<const cldnn::program_node *>& get_users() const { return _node->get_users(); }
std::vector<std::shared_ptr<primitive_inst>> get_user_insts() const {
std::vector<primitive_id> users;
for (auto u : get_users()) {
users.push_back(u->id());
}
return _network.get_primitives(users);
}
const kernel_impl_params* get_impl_params() const { return _impl_params.get(); }
// return pointer to const to prevent arbitrary 'execute' call -> use primitive_inst.execute() instead
@ -199,7 +206,7 @@ public:
void set_shape_change() { _shape_changed = true; }
void build_deps();
void do_runtime_in_place_concat();
memory::ptr fused_memory(size_t dep_id) const {
return dep_memory_ptr(get_fused_mem_offset() + dep_id);
}
@ -220,7 +227,7 @@ public:
void allocate_internal_buffers();
static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node,
const kernel_impl_params& impl_params, uint32_t net_id, bool is_internal, size_t idx = 0, bool reset_mem = true);
const kernel_impl_params& impl_params, uint32_t net_id, bool is_internal, size_t idx = 0, bool reset_mem = true, bool is_output_buffer = false);
std::vector<memory::cptr> get_intermediates_memories() const { return _intermediates_memory; }
@ -257,6 +264,7 @@ protected:
program_node const* _node;
layout _node_output_layout;
bool update_shape_done_by_other = false;
std::unique_ptr<kernel_impl_params> _impl_params;
std::unique_ptr<primitive_impl> _impl;
std::unique_ptr<primitive_impl> _dynamic_impl = nullptr;
@ -317,7 +325,7 @@ protected:
size_t max_output_layout_size = 0;
std::vector<size_t> max_intermediates_memory_sizes;
std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr, bool reset_mem = true);
std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr, bool reset_mem = true, bool runtime_alloc = false);
memory::ptr allocate_internal_buffer(size_t idx);
static std::vector<std::shared_ptr<primitive_inst>> build_exec_deps(
std::vector<std::pair<std::shared_ptr<primitive_inst>, int32_t>> const& mem_deps);


@ -1083,9 +1083,35 @@ void network::build_insts_deps() {
void network::build_exec_order() {
GPU_DEBUG_DEFINE_MEM_LOGGER("build_exec_order");
for (auto& node : _program->get_processing_order()) {
if (!node->is_type<data>() && !(node->is_type<mutable_data>() && node->get_dependencies().empty())) {
add_to_exec_order(node->id());
if (!_is_dynamic) {
for (auto& node : _program->get_processing_order()) {
if (!node->is_type<data>() && !(node->is_type<mutable_data>() && node->get_dependencies().empty())) {
add_to_exec_order(node->id());
}
}
} else {
auto is_runtime_optimized_concat = [&](const program_node* node) {
return (node->is_dynamic() && node->is_type<concatenation>() && node->can_be_optimized());
};
auto is_allowed_pred_for_runtime_optimized_concat = [&](const program_node* node) {
return (!node->is_type<data>() && !(node->is_type<mutable_data>() && node->get_dependencies().empty()) &&
node->get_users().size() == 1 && is_runtime_optimized_concat(node->get_users().front()));
};
for (auto& node : _program->get_processing_order()) {
if (!node->is_type<data>() && !(node->is_type<mutable_data>() && node->get_dependencies().empty())) {
if (is_allowed_pred_for_runtime_optimized_concat(node)) {
continue;
} else if (is_runtime_optimized_concat(node)) {
// For in-place concat applied at runtime, we need to do update_shape for all the other predecessors of the concat as well,
// i.e., we need to make sure that all the parents of those predecessors have already been updated too.
for (auto dep : node->get_dependencies()) {
if (!dep.first->is_type<data>()) {
add_to_exec_order(dep.first->id());
}
}
}
add_to_exec_order(node->id());
}
}
}
}
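
The exec-order change above can be summarized with a toy sketch (illustrative types, not cldnn): a predecessor whose only user is a runtime-optimized concat is skipped in the main walk and re-emitted immediately before that concat, so shape inference for all predecessors can be completed together before the in-place decision is made. It assumes the simple single-user pattern that the pass supports.

#include <iostream>
#include <string>
#include <vector>

struct Node {                             // toy graph node, not the cldnn program_node
    std::string id;
    bool is_dynamic;
    bool optimized_concat;                // concat that may be fused in place at runtime
    std::vector<Node*> deps;
    std::vector<Node*> users;
};

std::vector<std::string> build_exec_order(const std::vector<Node*>& processing_order) {
    auto is_rt_opt_concat = [](const Node* n) { return n->is_dynamic && n->optimized_concat; };
    auto is_deferred_pred = [&](const Node* n) {
        return n->users.size() == 1 && is_rt_opt_concat(n->users.front());
    };
    std::vector<std::string> order;
    for (Node* n : processing_order) {
        if (is_deferred_pred(n))
            continue;                     // emitted later, right before its concat user
        if (is_rt_opt_concat(n))
            for (Node* d : n->deps)       // group every predecessor just before the concat
                order.push_back(d->id);
        order.push_back(n->id);
    }
    return order;
}

int main() {
    Node p1{"permute1", true, false, {}, {}};
    Node p2{"permute2", true, false, {}, {}};
    Node c{"concat", true, true, {}, {}};
    c.deps = {&p1, &p2};
    p1.users = {&c};
    p2.users = {&c};
    for (const auto& id : build_exec_order({&p1, &p2, &c}))
        std::cout << id << std::endl;     // permute1, permute2, concat
    return 0;
}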


@ -1,7 +1,7 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "program_helpers.h"
#include "primitive_inst.h"
#include "data_inst.h"
#include "mutable_data_inst.h"
@ -11,6 +11,10 @@
#include "fully_connected_inst.h"
#include "convolution_inst.h"
#include "crop_inst.h"
#include "pooling_inst.h"
#include "permute_inst.h"
#include "resample_inst.h"
#include "reshape_inst.h"
#include "eltwise_inst.h"
#include "deconvolution_inst.h"
#include "shape_of_inst.h"
@ -18,6 +22,7 @@
#include "experimental_detectron_roi_feature_extractor_inst.hpp"
#include "compilation_context.hpp"
#include "implementation_map.hpp"
#include "graph_optimizer/prepare_buffer_fusing.h"
#include "intel_gpu/plugin/common_utils.hpp"
#include "intel_gpu/graph/network.hpp"
@ -41,7 +46,8 @@
namespace cldnn {
namespace {
bool is_optimized_output_user(const program_node* user) {
template <typename T>
bool is_optimized_output_user(const T user) {
if (user->can_be_optimized()) {
if (user->is_output())
return true;
@ -56,18 +62,25 @@ bool is_optimized_output_user(const program_node* user) {
}
return false;
}
bool is_output_buffer(const program_node& node) {
if (node.is_output())
bool is_output_buffer(const primitive_inst* prim, bool runtime_alloc) {
if (prim->is_output())
return true;
// Try to recursively find any optimized out user which is also network output
for (const auto& user : node.get_users()) {
if (is_optimized_output_user(user)) {
return true;
if (runtime_alloc) {
// Try to recursively find any optimized out user which is also network output
for (const auto& user : prim->get_user_insts()) {
if (is_optimized_output_user<const std::shared_ptr<primitive_inst>>(user)) {
return true;
}
}
} else {
for (const auto& user : prim->get_node().get_users()) {
if (is_optimized_output_user<const program_node*>(user)) {
return true;
}
}
}
return false;
}
@ -179,7 +192,12 @@ void primitive_inst::set_output_memory(memory::ptr mem_new, bool check, size_t i
void primitive_inst::update_shape() {
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::shape_inference);
if (update_shape_done_by_other) {
update_shape_done_by_other = false; // reset
GPU_DEBUG_TRACE_DETAIL << id() << ": update shape is done by other: "
<< _impl_params->output_layouts[0].to_short_string() << std::endl;
return;
}
bool input_shape_changed = false;
for (size_t i = 0; i < _deps.size(); i++) {
auto idx = _deps[i].second;
@ -279,6 +297,15 @@ event::ptr primitive_inst::realloc_if_needed() {
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::memory_allocation);
event::ptr ev = nullptr;
if (_node->get_users().size() == 1 && _node->get_users().front()->is_type<concatenation>()) {
auto concat_inst = _network.get_primitive(get_users().front()->id());
if (concat_inst->can_be_optimized()) {
concat_inst->realloc_if_needed();
this->_outputs[0] = concat_inst->_outputs[0];
GPU_DEBUG_TRACE_DETAIL << id() << ": use concat user's memory " << this->_outputs[0]->buffer_ptr() << std::endl;
return ev;
}
}
// Update param if fake_alignment is available
auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params);
auto actual_layout = updated_params.get_output_layout();
@ -292,7 +319,8 @@ event::ptr primitive_inst::realloc_if_needed() {
if (can_reuse_buffer) {
GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer" << std::endl;
_outputs[0] = _network.get_engine().reinterpret_buffer(*_outputs[0], actual_layout);
if (_outputs[0]->get_layout() != actual_layout)
_outputs[0] = _network.get_engine().reinterpret_buffer(*_outputs[0], actual_layout);
if (need_reset_output_memory()) {
ev = _outputs[0]->fill(_network.get_stream());
}
@ -300,7 +328,7 @@ event::ptr primitive_inst::realloc_if_needed() {
GPU_DEBUG_TRACE_DETAIL << id() << ": realloc output memory. "
<< " Current buffer_size=" << max_output_layout_size
<< " Requested buffer_size=" << actual_layout.count() << std::endl;
_outputs = allocate_outputs(&updated_params, need_reset_output_memory());
_outputs = allocate_outputs(&updated_params, need_reset_output_memory(), true);
// TODO : need to handle multiple outputs
max_output_layout_size = updated_params.output_layouts[0].count();
}
@ -385,14 +413,14 @@ bool primitive_inst::update_impl() {
auto data_padding = params.output_layouts[i].data_padding;
for (size_t j = 0; j < output_shape_max_rank.size(); j++) {
if (is_dynamic_pad[j] == 1) {
GPU_DEBUG_TRACE_DETAIL
<< " shape_info[" << offset << "] = " << data_padding.lower_size().sizes()[j]
<< "(pad_before for output[" << i << "] " << j << "-th dim)" << std::endl;
lock[offset++] = data_padding.lower_size().sizes()[j]; // pad_before
GPU_DEBUG_TRACE_DETAIL << " shape_info[" << offset
<< "] = " << data_padding.lower_size().sizes()[j]
auto lower_pads = data_padding.lower_size().sizes(format::get_default_format(layout::max_rank()));
GPU_DEBUG_TRACE_DETAIL << " shape_info[" << offset << "] = " << lower_pads[j]
<< "(pad_before for output[" << i << "] " << j << "-th dim)" << std::endl;
lock[offset++] = lower_pads[j];
auto upper_pads = data_padding.upper_size().sizes(format::get_default_format(layout::max_rank()));
GPU_DEBUG_TRACE_DETAIL << " shape_info[" << offset << "] = " << upper_pads[j]
<< "(pad_after for output[" << i << "] " << j << "-th dim)" << std::endl;
lock[offset++] = data_padding.upper_size().sizes()[j]; // pad_after
lock[offset++] = upper_pads[j]; // pad_after
}
}
}
@ -415,10 +443,18 @@ bool primitive_inst::update_impl() {
updated_params.weights_layout = optional_layout(original_weights_memory->get_layout());
}
auto updated_params_no_dyn_pad = updated_params;
for (auto& i : updated_params_no_dyn_pad.input_layouts) {
i.data_padding.set_dynamic_pad(tensor(0));
}
for (auto& o : updated_params_no_dyn_pad.output_layouts) {
o.data_padding.set_dynamic_pad(tensor(0));
}
auto& cache = get_network().get_program()->get_implementations_cache();
std::shared_ptr<primitive_impl> cached_impl = nullptr;
{
cached_impl = cache.get(updated_params);
cached_impl = cache.get(updated_params_no_dyn_pad);
if (cached_impl) {
_impl = cached_impl->clone();
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
@ -431,13 +467,6 @@ bool primitive_inst::update_impl() {
if (!cached_impl) {
if (_dynamic_impl) {
auto& compilation_context = get_network().get_program()->get_compilation_context();
auto updated_params_no_dyn_pad = updated_params;
for (auto& i : updated_params_no_dyn_pad.input_layouts) {
i.data_padding.set_dynamic_pad(tensor(0));
}
for (auto& o : updated_params_no_dyn_pad.output_layouts) {
o.data_padding.set_dynamic_pad(tensor(0));
}
compilation_context.push_task(updated_params_no_dyn_pad.hash(), [this, &compilation_context, updated_params_no_dyn_pad]() {
if (compilation_context.is_stopped())
return;
@ -454,25 +483,23 @@ bool primitive_inst::update_impl() {
if (!can_be_optimized()) {
auto kernels = _program->get_kernels_cache().compile(updated_params_no_dyn_pad, impl->get_kernels_source());
impl->set_kernels(kernels);
cache.add(updated_params_no_dyn_pad, impl->clone());
}
cache.add(updated_params_no_dyn_pad, impl->clone());
});
if (!can_be_optimized()) {
_impl = _dynamic_impl->clone();
auto new_impl_params = _impl->canonicalize_shapes(*_impl_params);
_impl->update_dispatch_data(new_impl_params);
update_shape_info(new_impl_params);
}
} else {
_impl = _node->type()->choose_impl(*_node, updated_params);
_impl = _node->type()->choose_impl(*_node, updated_params_no_dyn_pad);
if (!can_be_optimized()) {
auto& kernels_cache = get_network().get_program()->get_kernels_cache();
auto kernels = kernels_cache.compile(updated_params, _impl->get_kernels_source());
auto kernels = kernels_cache.compile(updated_params_no_dyn_pad, _impl->get_kernels_source());
_impl->set_kernels(kernels);
cache.add(updated_params_no_dyn_pad, _impl->clone());
}
cache.add(updated_params, _impl->clone());
auto new_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
GPU_DEBUG_TRACE_DETAIL << id() << ": update impl from " << prev_impl_str << " to " << new_impl_str << std::endl;
}
@ -484,6 +511,64 @@ bool primitive_inst::update_impl() {
return true;
}
void primitive_inst::do_runtime_in_place_concat() {
if (update_shape_done_by_other)
return;
if (get_users().size() != 1) return;
auto concat_inst = _network.get_primitive(get_users().front()->id());
if (!concat_inst->get_node().is_type<concatenation>() || !concat_inst->get_node().can_be_optimized())
return;
// Currently does not support cascaded concats
std::vector<std::shared_ptr<primitive_inst>> concat_preds;
for (auto pred : concat_inst->_deps) {
concat_preds.push_back(pred.first);
}
GPU_DEBUG_TRACE_DETAIL << "[In place concat] Preparing for runtime buffer fusing" << std::endl;
// Do shape_infer for all concat's preds and concat
for (auto pred : concat_preds) {
if (!pred->update_shape_done_by_other) {
GPU_DEBUG_TRACE_DETAIL << "[In place concat] update shape for " << pred->id() << std::endl;
pred->update_shape();
pred->update_shape_done_by_other = true;
}
}
GPU_DEBUG_TRACE_DETAIL << "[In place concat] update shape for " << concat_inst->id() << std::endl;
concat_inst->update_shape();
concat_inst->update_shape_done_by_other = true;
layout concat_layout = concat_inst->_impl_params->get_output_layout();
std::vector<kernel_impl_params> pred_params;
std::vector<layout> preds_layouts;
for (auto pred : concat_inst->_deps) {
pred_params.push_back(*pred.first->_impl_params);
preds_layouts.push_back(pred.first->_impl_params->get_output_layout());
}
if (!concat_in_place_optimization::match(concat_inst->get_node(), *concat_inst->_impl_params, pred_params, true)) {
concat_inst->_can_be_optimized = false;
GPU_DEBUG_TRACE_DETAIL << "[In place concat] " << concat_inst->id() << " cannot be optimized " << std::endl;
return;
}
auto concat_axis = concat_inst->_impl_params->typed_desc<concatenation>()->axis;
concat_in_place_optimization::update_in_place_concat_paddings(concat_layout, preds_layouts, concat_axis, true);
size_t i = 0;
for (auto& dep : concat_inst->_deps) {
if (_impl_params->output_layouts[0] != preds_layouts[i]) {
dep.first->set_shape_change();
dep.first->_impl_params->output_layouts[0] = preds_layouts[i];
}
GPU_DEBUG_TRACE_DETAIL << "[In place concat] Update padding of pred " << i << " : "
<< dep.first->_impl_params->output_layouts[0].to_string() << std::endl;
++i;
}
concat_inst->_impl_params->output_layouts[0] = concat_layout;
concat_inst->_can_be_optimized = true;
GPU_DEBUG_TRACE_DETAIL << "[In place concat] " << concat_inst->id() << ": can_be_optimized " << std::endl;
}
event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
const auto primitive_id = id();
OPENVINO_ASSERT(_has_valid_input, primitive_id, " has invalid/unset input");
@ -491,10 +576,13 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
std::vector<event::ptr> dependencies;
if (is_dynamic()) {
do_runtime_in_place_concat();
OPENVINO_ASSERT(_node != nullptr, "[GPU] Invalid primitive_inst object for dynamic shapes case: program_node can't be null");
update_shape();
if (_impl_params->output_layouts[0].bytes_count() == 0) {
if (_impl_params->output_layouts[0].count() == 0) {
GPU_DEBUG_TRACE_DETAIL << id() << " : Skipping because output data is empty " << std::endl;
auto ev = get_network().get_stream().create_user_event(true);
update_shape_done_by_other = false; // reset
return ev;
}
@ -539,7 +627,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
}
}
}
update_shape_done_by_other = false; // reset
OPENVINO_ASSERT(_impl_params->get_output_layout().is_static(),
"[GPU] Can't execute ", primitive_id, " primitive as output layout is dynamic in runtime");
@ -914,7 +1002,7 @@ static bool user_requesting_mem_reuse_false(const program_node& node) {
}
memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool, const program_node& _node, const kernel_impl_params& impl_params,
uint32_t net_id, bool is_internal, size_t idx, bool reset) {
uint32_t net_id, bool is_internal, size_t idx, bool reset, bool is_output_buffer) {
auto get_memory_from_pool = [&](engine& _engine, const layout& layout, const primitive_id id, std::set<primitive_id> dependencies,
allocation_type type, bool reusable, bool reset = true) {
OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout without upper bound");
@ -925,7 +1013,6 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
return pool.get_memory(static_layout, type, reset);
};
auto layout = impl_params.get_output_layout(idx);
OPENVINO_ASSERT(layout.is_static() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout");
auto device_mem_acc = [&](size_t a, const cldnn::layout& l) {
@ -942,16 +1029,12 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
if (total_device_input_mem_size > _engine.get_device_info().max_global_mem_size)
usm_device_allocatable = false;
bool memory_reuse_by_user = true;
if (user_requesting_mem_reuse_false(_node)) {
memory_reuse_by_user = false;
}
bool memory_reuse_by_user = !user_requesting_mem_reuse_false(_node);
// For outputs, cpu prim we want to have lockable alloc type
// Also if the successor of a node is an cpu, then memory needs to be lockable.
bool is_cpu = _node.get_selected_impl() ? _node.get_selected_impl()->is_cpu() : false;
auto use_lockable_memory = is_output_buffer(_node) || is_cpu || is_any_user_cpu(_node.get_users()) ||
auto use_lockable_memory = is_output_buffer || is_cpu || is_any_user_cpu(_node.get_users()) ||
!_engine.supports_allocation(allocation_type::usm_device) ||
(_node.is_shape_infer_dep() && _engine.get_device_info().dev_type == device_type::integrated_gpu);
const auto& lockable_mem_type = _engine.get_lockable_preferred_memory_allocation_type(layout.format.is_image_2d());
@ -959,47 +1042,46 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
auto alloc_type = use_lockable_memory ? lockable_mem_type
: !usm_device_allocatable ? lockable_mem_type : allocation_type::usm_device;
if ((is_internal && (_node.can_be_optimized() || _node.is_type<generic_layer>())) || (memory_reuse_by_user == false)) {
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
// Use usm_device memory for weights reordering
if (is_internal && _node.is_type<generic_layer>() && _engine.supports_allocation(allocation_type::usm_device))
alloc_type = allocation_type::usm_device;
return get_memory_from_pool(_engine,
layout,
_node.id(),
_node.get_memory_dependencies(),
alloc_type,
false,
reset);
} else if (is_internal && _node.is_output() && _node.is_type<generic_layer>() &&
_engine.supports_allocation(allocation_type::usm_device) && usm_device_allocatable) {
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
return _engine.allocate_memory(layout, allocation_type::usm_device, false);
} else if (is_internal && !_node.is_output() && _node.is_type<input_layout>()) {
// Skip memory reset for input_layout primitives, since data will be copied from cldnn::data primitive
// or just reuse primitive's memory
GPU_DEBUG_LOG << "[" << _node.id() << ": constant]" << std::endl;
return _engine.allocate_memory(layout, alloc_type, false);
} else if (is_internal || (!_node.can_share_buffer()) || _node.can_be_optimized() || _node.is_output()) {
if (is_internal) {
if (_node.can_be_optimized() || _node.is_type<generic_layer>()) {
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
// Use usm_device memory for weights reordering
if (is_internal && _node.is_type<generic_layer>() &&
_engine.supports_allocation(allocation_type::usm_device))
alloc_type = allocation_type::usm_device;
return get_memory_from_pool(_engine,
layout,
_node.id(),
_node.get_memory_dependencies(),
alloc_type,
false,
reset);
} else {
if ((_node.is_output() && _node.is_type<generic_layer>()) || (!_node.is_output() && _node.is_type<input_layout>()))
reset = false;
GPU_DEBUG_LOG << "[" << _node.id() << ": constant]" << std::endl;
return _engine.allocate_memory(layout, alloc_type, reset);
}
} else if (!_node.can_share_buffer() || _node.can_be_optimized() || _node.is_output()) {
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
return _engine.allocate_memory(layout, alloc_type, reset);
} else {
return get_memory_from_pool(_engine,
layout,
_node.id(),
_node.get_memory_dependencies(),
alloc_type,
true,
reset);
layout,
_node.id(),
_node.get_memory_dependencies(),
alloc_type,
memory_reuse_by_user,
reset);
}
}
std::vector<memory::ptr> primitive_inst::allocate_outputs(kernel_impl_params* updated_params, bool reset_mem) {
std::vector<memory::ptr> primitive_inst::allocate_outputs(kernel_impl_params* updated_params, bool reset_mem, bool runtime_alloc) {
std::vector<memory::ptr> outputs;
for (size_t i = 0; i < get_node().get_outputs_count() ; ++i) {
outputs.push_back(allocate_output(get_network().get_engine(), _network.get_memory_pool(),
*_node, (updated_params != nullptr) ? *updated_params : *_impl_params,
get_network_id(), _network.is_internal(), i, reset_mem));
get_network_id(), _network.is_internal(), i, reset_mem, is_output_buffer(this, runtime_alloc)));
}
return outputs;
}


@ -1598,7 +1598,15 @@ std::pair<int64_t, int64_t> program::get_estimated_device_mem_usage() {
} else if (node->is_type<mutable_data>() && node->get_dependencies().empty()) {
continue;
} else {
allocated_mem_ptrs.insert(primitive_inst::allocate_output(engine, pool, *node, *node->get_kernel_impl_params(), 0, false));
allocated_mem_ptrs.insert(primitive_inst::allocate_output(engine,
pool,
*node,
*node->get_kernel_impl_params(),
0,
false,
0,
false,
node->is_output()));
}
}


@ -152,3 +152,107 @@ TEST(prepare_buffer_fusing, propagate_data_padding) {
ASSERT_EQ(output_ptr[i], input_ptr[i]);
}
}
TEST(prepare_buffer_fusing, in_place_concat_static) {
auto& engine = get_test_engine();
auto in_layout1 = layout{ ov::PartialShape{1, 2, 3, 4}, data_types::f32, format::bfyx }; // => {1, 4, 3, 2}
auto in_layout2 = layout{ ov::PartialShape{1, 2, 4, 1}, data_types::f32, format::bfyx }; // => {1, 4, 1, 2}
topology topology;
topology.add(input_layout("input1", in_layout1));
topology.add(input_layout("input2", in_layout2));
topology.add(permute("permute1", input_info("input1"), {0, 3, 2, 1}));
topology.add(permute("permute2", input_info("input2"), {3, 2, 0, 1}));
topology.add(concatenation("concat", { input_info("permute1"), input_info("permute2") }, 2));
topology.add(permute("output", input_info("concat"), {0, 2, 3, 1}));
ExecutionConfig config;
config.set_property(ov::intel_gpu::optimize_data(true));
auto prog = program::build_program(engine, topology, config, false, false);
ASSERT_NE(prog, nullptr);
cldnn::network net(prog, 0);
auto input_memory1 = engine.allocate_memory(in_layout1);
auto input_memory2 = engine.allocate_memory(in_layout2);
set_values<float>(input_memory1,
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 11.0, 22.0, 33.0, 44.0, 55.0, 66.0,
111.0, 222.0, 333.0, 444.0, 555.0, 666.0, 1111.0, 2222.0, 3333.0, 4444.0, 5555.0, 6666.0});
set_values<float>(input_memory2, {1234.0, 2345.0, 3456.0, 4567.0, 5678.0, 6789.0, 9012.0, 9999.0});
net.set_input_data("input1", input_memory1);
net.set_input_data("input2", input_memory2);
std::map<cldnn::primitive_id, cldnn::network_output> output;
EXPECT_NO_THROW(output = net.execute());
const auto& concat_node = net.get_primitive("concat")->get_node();
auto concat_mem = net.get_primitive("concat")->output_memory_ptr();
auto permute1_mem = net.get_primitive("permute1")->output_memory_ptr();
auto permute2_mem = net.get_primitive("permute2")->output_memory_ptr();
ASSERT_TRUE(concat_node.can_be_optimized());
ASSERT_EQ(concat_mem, permute1_mem);
ASSERT_EQ(concat_mem, permute2_mem);
auto out_lay = net.get_output_layout("output");
auto out_mem = output.at("output").get_memory();
cldnn::mem_lock<float> output_ptr(out_mem, get_test_stream());
std::vector<float> ref_output = {1.0, 2.0, 3.0, 4.0, 111.0, 222.0, 333.0, 444.0, 5.0, 6.0, 11.0,
22.0, 555.0, 666.0, 1111.0, 2222.0, 33.0, 44.0, 55.0, 66.0, 3333.0, 4444.0,
5555.0, 6666.0, 1234.0, 2345.0, 3456.0, 4567.0, 5678.0, 6789.0, 9012.0, 9999.0};
for (size_t x = 0; x < out_lay.count(); ++x) {
ASSERT_EQ(ref_output[x], output_ptr[x]);
}
}
TEST(prepare_buffer_fusing, in_place_concat_dynamic) {
auto& engine = get_test_engine();
auto in_layout1_0 = layout{ ov::PartialShape::dynamic(4), data_types::f32, format::bfyx };
auto in_layout2_0 = layout{ ov::PartialShape::dynamic(4), data_types::f32, format::bfyx };
auto in_layout1 = layout{ ov::PartialShape{1, 2, 3, 4}, data_types::f32, format::bfyx };
auto in_layout2 = layout{ ov::PartialShape{1, 2, 4, 1}, data_types::f32, format::bfyx };
topology topology;
topology.add(input_layout("input1", in_layout1_0));
topology.add(input_layout("input2", in_layout2_0));
topology.add(permute("permute1", input_info("input1"), {0, 3, 2, 1}));
topology.add(permute("permute2", input_info("input2"), {3, 2, 0, 1}));
topology.add(concatenation("concat", { input_info("permute1"), input_info("permute2") }, 2));
topology.add(permute("output", input_info("concat"), {0, 2, 3, 1}));
ExecutionConfig config;
config.set_property(ov::intel_gpu::optimize_data(true));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
auto prog = program::build_program(engine, topology, config, false, false);
ASSERT_NE(prog, nullptr);
cldnn::network net(prog, 0);
auto input_memory1 = engine.allocate_memory(in_layout1);
auto input_memory2 = engine.allocate_memory(in_layout2);
set_values<float>(input_memory1,
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 11.0, 22.0, 33.0, 44.0, 55.0, 66.0,
111.0, 222.0, 333.0, 444.0, 555.0, 666.0, 1111.0, 2222.0, 3333.0, 4444.0, 5555.0, 6666.0});
set_values<float>(input_memory2, {1234.0, 2345.0, 3456.0, 4567.0, 5678.0, 6789.0, 9012.0, 9999.0});
net.set_input_data("input1", input_memory1);
net.set_input_data("input2", input_memory2);
std::vector<float> ref_output = {1.0, 2.0, 3.0, 4.0, 111.0, 222.0, 333.0, 444.0, 5.0, 6.0, 11.0,
22.0, 555.0, 666.0, 1111.0, 2222.0, 33.0, 44.0, 55.0, 66.0, 3333.0, 4444.0,
5555.0, 6666.0, 1234.0, 2345.0, 3456.0, 4567.0, 5678.0, 6789.0, 9012.0, 9999.0};
std::map<cldnn::primitive_id, cldnn::network_output> output;
EXPECT_NO_THROW(output = net.execute());
auto out_l = net.get_output_layout("output");
auto out_mem = output.at("output").get_memory();
cldnn::mem_lock<float> output_ptr(out_mem, get_test_stream());
const auto& concat_node = net.get_primitive("concat")->get_node();
auto concat_mem = net.get_primitive("concat")->output_memory_ptr();
auto permute1_mem = net.get_primitive("permute1")->output_memory_ptr();
auto permute2_mem = net.get_primitive("permute2")->output_memory_ptr();
ASSERT_TRUE(concat_node.can_be_optimized());
ASSERT_EQ(concat_mem.get(), permute1_mem.get());
ASSERT_EQ(concat_mem.get(), permute2_mem.get());
for (size_t x = 0; x < out_l.count(); ++x) {
ASSERT_EQ(ref_output[x], output_ptr[x]);
}
}