[GPU] Enable runtime buffer fusing for dynamic shape (#17668)

* Initial impl for runtime buffer fusing
Passing unit tests with the static kernel

* Pass unit tests with the dynamic impl

* Refactor allocate_output

* Separate header of buffer fusing

* Refactored buffer fusing :: matcher/optimize

* More cleanup

* Fix crash in Dolly

* Reset can_be_optimized of primitive_inst when it can no longer be optimized

* Fix empty tensor: a primitive with empty data should be skipped

* Fix issue in dynamic padding: a static kernel should not contain dynamic padding dims
Fix missing reset of the update_shape_done_by_other flag

* Do not add an empty kernel to the cache for an optimized-out inst

* Fix corner case error in buffer fusing
- Shapes of some preds may not change, but update_impl is still needed because 1) paddings change and 2) output memory should be updated
- An optimizable impl should not be added to the cache

* Allow reorder & permute_ref as predecessors of an optimized concat

* Some more fixes:
Runtime buffer fusing is available only when all preds and the concat are dynamic
Runtime buffer fusing is to be executed only if the node is dynamic

* Fix the allocate_output arguments passed by get_estimated_device_mem_usage according to the new signature

* Fixed error in cascaded concat

Need to reinterpret the buffer even though the size is the same
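
Illustrative note (not part of the commit): the core idea of runtime buffer fusing for concat is that each predecessor writes directly into its own slice of a single shared output buffer, selected via lower/upper padding along the concat axis, so the concat kernel itself becomes a no-op. A minimal standalone C++ sketch of that idea, with purely hypothetical names:

#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

struct SliceView {           // hypothetical helper: one predecessor's window into the shared buffer
    float* base;             // shared concat output buffer
    size_t lower_pad;        // elements written by earlier inputs along the concat axis
    size_t len;              // this predecessor's extent along the concat axis
    size_t inner;            // contiguous elements per concat-axis step
    float* at(size_t c, size_t i) { return base + (lower_pad + c) * inner + i; }
};

int main() {
    const size_t inner = 4;            // e.g. flattened y*x
    const size_t len_a = 2, len_b = 3; // predecessor extents along the concat axis
    std::vector<float> shared((len_a + len_b) * inner, 0.f); // single buffer for the whole concat output

    SliceView a{shared.data(), 0,     len_a, inner};
    SliceView b{shared.data(), len_a, len_b, inner}; // lower pad of b equals the length of a

    // Each "predecessor kernel" writes straight into its slice of the shared buffer.
    for (size_t c = 0; c < len_a; ++c)
        for (size_t i = 0; i < inner; ++i) *a.at(c, i) = 1.f;
    for (size_t c = 0; c < len_b; ++c)
        for (size_t i = 0; i < inner; ++i) *b.at(c, i) = 2.f;

    // The concat is "optimized out": its output is simply the shared buffer, no copy kernel runs.
    assert(shared[0] == 1.f && shared[len_a * inner] == 2.f);
    std::cout << "in-place concat output ready without a copy" << std::endl;
    return 0;
}

In the changes below, these slices are expressed as layout paddings that are computed at runtime once the predecessor shapes are known.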
Author: Taylor Yeonbok Lee, 2023-06-02 12:39:28 -07:00, committed by GitHub
parent c3a54b0a6e
commit f670dc5a0d
7 changed files with 571 additions and 277 deletions


@ -1,7 +1,7 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "prepare_buffer_fusing.h"
#include "pooling_inst.h"
#include "primitive_inst.h"
#include "activation_inst.h"
@ -25,45 +25,7 @@
using namespace cldnn;
namespace {
struct concat_noop_optimization : pattern_match_optimization_typed<concat_noop_optimization, concatenation> {
// Removes concatenation nodes with single input.
using base = pattern_match_optimization_typed<concat_noop_optimization, concatenation>;
using base::base;
bool match(concatenation_node& node);
bool optimize(concatenation_node& node);
};
struct concat_in_place_optimization : pattern_match_optimization_typed<concat_in_place_optimization, concatenation> {
// Performs in-place concat optimization.
// Padding of predecessors is updated to use single buffer by all, which is output from concatenation.
// Then concatenation can be optimized out, as memory will be correctly filled by previous nodes.
// If one of the dependencies is also an optimized-out concatenation, then cascade adjustment is performed to update it.
// This optimization is expected to be executed in some topological order, as cascade adjustment is performed backwards.
using base = pattern_match_optimization_typed<concat_in_place_optimization, concatenation>;
using base::base;
// Runs concat in-place optimization and adds already optimized concatenations that need re-optimization to `needs_reoptimization`.
void optimize_cascade(concatenation_node& node, std::list<concatenation_node*>& need_reoptimization);
bool match(concatenation_node& node);
bool optimize(concatenation_node& node) {
std::list<concatenation_node*> need_reopt;
optimize_cascade(node, need_reopt);
while (!need_reopt.empty()) {
auto& prop = *need_reopt.front();
need_reopt.pop_front();
if (match(prop))
optimize_cascade(prop, need_reopt);
else
// TODO: Revert extra padding when cascade adjustment failed.
prop.can_be_optimized(false);
}
return false; // node not invalidated
}
};
namespace cldnn {
bool concat_noop_optimization::match(concatenation_node& node) {
if (node.is_output())
return false;
@ -82,19 +44,127 @@ bool concat_noop_optimization::optimize(concatenation_node& node) {
}
bool concat_in_place_optimization::match(concatenation_node& node) {
if (node.is_output())
return false;
if (node.has_fused_primitives())
return false;
if (node.is_dynamic())
return false;
std::vector<kernel_impl_params> pred_params;
for (auto pred : node.get_dependencies()) {
pred_params.push_back(*pred.first->get_kernel_impl_params());
}
return (match(node, *node.get_kernel_impl_params(), pred_params));
}
// reverted condition - if any of this node's inputs is used by more than one primitive
// and is not optimized concatenation then do not fuse buffers
// TODO: we need to add padding support for all optimized kernels to remove this condition
auto available_pred = [](const program_node& input) {
if (!input.is_type<pooling>() && !input.is_type<convolution>() && !input.is_type<quantize>() &&
!input.is_type<activation>() && !input.is_type<deconvolution>() && !input.is_type<concatenation>() &&
!input.is_type<crop>() && !input.is_type<eltwise>() && !input.is_type<resample>() &&
!input.is_type<reorder>() && !(input.is_type<permute>() && !input.as<permute>().is_rotating_except_batch()))
return false;
return true;
};
bool concat_in_place_optimization::match(const program_node& concat_node,
kernel_impl_params concat_params,
std::vector<kernel_impl_params> pred_params,
bool is_runtime) {
if (concat_node.is_output() || concat_params.fused_desc.size() > 0)
return false;
auto pred_nodes = concat_node.get_dependencies();
for (auto p : pred_nodes) {
// TODO : In dynamic shape only one user is allowed for optimized concat
// It is mainly because of the limited flexibility of the current exec order
// For now, we do shape_infer for all pred nodes and the concat when executing one of the predecessors for runtime buffer fusing
// So we need to ensure that shape_infer of all the parents of the other predecessors is done as well
// We would need to shuffle the exec order for that requirement, but currently only a simple method is applied, which is applicable
// only for simple patterns where the concat is the only user of all the preds
// Also, cascaded concat is not handled for dynamic shape for now
// If we have more flexible exec order handling in the future, we will be able to remove the condition below
if (p.first->is_dynamic() && p.first->get_users().size() > 1)
return false;
if (concat_node.is_dynamic() && !p.first->is_dynamic())
return false;
}
// If this is called from primitive_inst::execute() and the concat is static, that concat should already have been optimized at build time, not at runtime.
if (is_runtime && !concat_node.is_dynamic())
return false;
bool is_onednn_impl = false;
for (const auto& input : node.get_dependencies()) {
if (input.first->get_preferred_impl_type() == impl_types::onednn) {
for (const auto& fused_op : input.first->get_fused_primitives()) {
auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(*input.first, fused_op);
// For in place concatenation input layouts and data types must match.
// Also, it checks whether data along f-axis is aligned properly for implicit concat.
// Otherwise, use explicit concat instead.
auto output_format = concat_params.get_output_layout().format;
auto output_datatype = concat_params.get_output_layout().data_type;
auto concat_axis = concat_params.typed_desc<concatenation>()->axis;
auto def_fmt = format::get_default_format(concat_params.get_output_layout().get_rank());
auto lower_padd_in_axis = concat_params.get_output_layout().data_padding.lower_size().sizes(def_fmt)[concat_axis];
lower_padd_in_axis = std::max(lower_padd_in_axis,
pred_params[0].get_output_layout().data_padding.lower_size().sizes(def_fmt)[concat_axis]);
size_t idx = 0;
for (auto pred : pred_nodes) {
if (!available_pred(*pred.first))
return false;
if (pred.first->is_output())
return false;
// if an input is marked as network output, prevent optimizations
// which would affect a form of its output (unless debug flag is set),
// we also need to restrict input types to those which support padding on all axis
if (pred.first->is_dynamic() && is_runtime) {
if (!pred.first->is_padding_supported(concat_axis, lower_padd_in_axis))
return false;
}
// TODO: handle optimized reshape
if (pred.first->is_type<reshape>() && pred.first->can_be_optimized())
return false;
// TODO: Investigate if this condition is needed
if (pred.first->get_users().size() > 2)
return false;
// Check that input isn't optimized out concatenation along different axis.
if (pred.first->is_type<concatenation>() && pred.first->can_be_optimized()) {
// cascaded concat opt is not supported for dynamic shape yet
if (concat_node.is_dynamic() || is_runtime)
return false;
else if (pred.first->as<concatenation>().get_primitive()->axis != concat_axis)
return false;
}
// Check that input isn't optimized out non-concatenation.
if (!pred.first->is_type<concatenation>() && pred.first->can_be_optimized())
return false;
size_t concat_users = 0;
for (auto& user : pred.first->get_users())
if (user->is_type<concatenation>())
concat_users += 1;
// If input is used by more than one concatenation then they may require different paddings.
if (concat_users != 1)
return false;
layout pred_l = pred_params[idx].get_output_layout();
if (output_format != pred_l.format || output_datatype != pred_l.data_type)
return false;
if (pred_l.format.block_sizes().size() > 1)
return false;
// TODO: Below condition should be moved to program_node::supports_padding.
// This however will need updating the algorithm as it may make cascade adjustment impossible in some cases.
// It however would make normal optimizations possible in others, so this is a trade-off to be investigated.
if (idx != concat_node.get_dependencies().size() - 1) {
if ((pred_l.format == format::b_fs_yx_fsv16 || pred_l.format == format::b_fs_zyx_fsv16) &&
(pred_l.feature() % 16 != 0 || concat_axis != 1))
return false;
if ((pred_l.format == format::b_fs_yx_fsv32 || pred_l.format == format::b_fs_zyx_fsv32) &&
(pred_l.feature() % 32 != 0 || concat_axis != 1))
return false;
if (pred_l.format == format::b_fs_yx_fsv4 && (pred_l.feature() != 4 || concat_axis != 1))
return false;
}
if (pred.first->get_preferred_impl_type() == impl_types::onednn) {
for (const auto& fused_op : pred_params[idx].fused_desc) {
auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(*pred.first, fused_op);
if (add_type == add_fusing_type::sum)
return false;
else
@ -102,21 +172,42 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
}
// Optimized-out input node is no longer onednn impl.
if (!input.first->can_be_optimized())
if (!pred.first->can_be_optimized())
is_onednn_impl = true;
}
// If sibling is using onednn impl and batch > 1, the onednn impl cannot process the implicit concat'ed buffer.
// Onednn impls can process implicit concat'ed buffer only through buffer pointer manipulation.
if ((is_runtime && concat_params.get_output_layout().batch() > 1) ||
(!concat_node.is_dynamic() && concat_params.get_output_layout().batch() > 1)) {
for (auto& sib : pred.first->get_users()) {
if (sib->get_preferred_impl_type() == impl_types::onednn) {
return false;
}
}
}
auto input_padd = pred.first->get_output_layout().data_padding;
// Check that there isn't already some padding between inputs in concat axis.
// If node has already been optimized we skip this check - this is just cascade adjustment.
if (!concat_node.can_be_optimized()) {
if (idx != concat_node.get_dependencies().size() && input_padd.upper_size().sizes(def_fmt)[concat_axis] != 0)
return false;
if (idx != 0 && input_padd.lower_size().sizes(def_fmt)[concat_axis] != 0)
return false;
}
if (!concat_node.is_dynamic() || is_runtime)
lower_padd_in_axis += pred_params[idx].get_output_layout().get_tensor().sizes(def_fmt)[concat_axis];
idx++;
}
// Implicit concat for onednn only when use_usm and batch 1.
if (is_onednn_impl) {
bool use_usm = node.get_program().get_engine().use_unified_shared_memory();
layout out_l = node.get_output_layout();
bool use_usm = concat_node.get_program().get_engine().use_unified_shared_memory();
layout concat_out_l = concat_params.get_output_layout();
if (!use_usm)
return false;
if (out_l.batch() > 1)
if (concat_out_l.batch() > 1)
return false;
// TODO: cldnn cases should be updated. This logic is working for onednn only.
// white list for support fusing formats.
const std::vector<format> white_list = {
@ -128,140 +219,61 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
format::b_fs_zyx_fsv32,
format::b_fs_yx_fsv4,
};
if (std::find_if(white_list.begin(), white_list.end(), [&out_l](format fmt){ return (fmt == out_l.format); }) == std::end(white_list))
if (std::find_if(white_list.begin(), white_list.end(), [&concat_out_l](format fmt){ return (fmt == concat_out_l.format); }) == std::end(white_list))
return false;
}
// For in place concatenation input layouts and data types must match.
// Also, it checks whether data along f-axis is aligned properly for implicit concat.
// Otherwise, use explicit concat instead.
auto output_format = node.get_output_layout().format;
auto output_datatype = node.get_output_layout().data_type;
auto concat_axis = node.get_primitive()->axis;
auto def_fmt = format::get_default_format(node.get_output_layout().get_rank());
size_t idx = 0;
for (const auto& input : node.get_dependencies()) {
if (input.first->is_type<reshape>())
// reshapes should be optimized out.
return false;
layout l = input.first->get_output_layout();
if (output_format != l.format || output_datatype != l.data_type)
return false;
if (l.format.block_sizes().size() > 1)
return false;
// TODO: Below condition should be moved to program_node::supports_padding.
// This however will need updating the algorithm as it may make cascade adjustment impossible in some cases.
// It however would make normal optimizations possible in others, so this is a trade-off to be investigated.
if (idx != node.get_dependencies().size() - 1) {
if ((l.format == format::b_fs_yx_fsv16 || l.format == format::b_fs_zyx_fsv16) &&
(l.feature() % 16 != 0 || node.get_primitive()->axis != 1))
return false;
if ((l.format == format::b_fs_yx_fsv32 || l.format == format::b_fs_zyx_fsv32) &&
(l.feature() % 32 != 0 || node.get_primitive()->axis != 1))
return false;
if (l.format == format::b_fs_yx_fsv4 && (l.feature() != 4 || node.get_primitive()->axis != 1))
return false;
}
idx++;
}
auto lower_padd_in_axis = node.get_output_layout().data_padding.lower_size().sizes(def_fmt)[concat_axis];
lower_padd_in_axis = std::max(lower_padd_in_axis,
node.get_dependency(0).get_output_layout().data_padding.lower_size().sizes(def_fmt)[concat_axis]);
// check if concatenation in place can be applied for inputs set
idx = 0;
for (const auto& input : node.get_dependencies()) {
// reverted condition - if any of this node's inputs is used by more than one primitive
// and is not optimized concatenation then do not fuse buffers
// todo: we need add padding support for all optimized kernels to remove this condition
if (!input.first->is_type<pooling>() && !input.first->is_type<convolution>() && !input.first->is_type<quantize>() &&
!input.first->is_type<activation>() && !input.first->is_type<deconvolution>() &&
!input.first->is_type<concatenation>() && !input.first->is_type<crop>() && !input.first->is_type<eltwise>() &&
!input.first->is_type<resample>())
return false;
// if an input is marked as network output, prevent optimizations
// which would affect a form of its output (unless debug flag is set),
// we also need to restrict input types to those which support padding on all axis
if (input.first->is_output() || !input.first->is_padding_supported(concat_axis, lower_padd_in_axis))
return false;
// TODO: Investigate if this condition is needed
if (input.first->get_users().size() > 2)
return false;
// If sibling is using onednn impl and batch > 1, the onednn impl cannot process the implicit concat'ed buffer.
// Onednn impls can process implicit concat'ed buffer only through buffer pointer manipulation.
if (node.get_output_layout().batch() > 1) {
for (auto& sib : input.first->get_users()) {
if (sib->get_preferred_impl_type() == impl_types::onednn) {
return false;
}
}
}
// Check that input isn't optimized out concatenation along different axis.
if (input.first->is_type<concatenation>() && input.first->can_be_optimized() &&
input.first->as<concatenation>().get_primitive()->axis != concat_axis)
return false;
// Check that input isn't optimized out non-concatenation.
if (!input.first->is_type<concatenation>() && input.first->can_be_optimized())
return false;
size_t concat_users = 0;
for (auto& user : input.first->get_users())
if (user->is_type<concatenation>())
concat_users += 1;
// If input is used by more than one concatenation then they may require different paddings.
if (concat_users != 1)
return false;
auto input_padd = input.first->get_output_layout().data_padding;
// Check that there isn't already some padding between inputs in concat axis.
// If node has already been optimized we skip this check - this is just cascade adjustment.
if (!node.can_be_optimized()) {
if (idx != node.get_dependencies().size() && input_padd.upper_size().sizes(def_fmt)[concat_axis] != 0)
return false;
if (idx != 0 && input_padd.lower_size().sizes(def_fmt)[concat_axis] != 0)
return false;
}
lower_padd_in_axis += input.first->get_output_layout().get_tensor().sizes(def_fmt)[concat_axis];
idx += 1;
}
return true;
}
void concat_in_place_optimization::optimize_cascade(concatenation_node& node, std::list<concatenation_node*>& need_reoptimization) {
auto out_layout = node.get_output_layout();
auto out_rank = out_layout.get_rank();
auto concat_axis = node.get_primitive()->axis;
// We need to transform axis from bf[w][z]yx order to bfxy[z][w] due to tensor.sizes() usages here
std::vector<layout> preds_layouts;
for (auto dep : node.get_dependencies()) {
if (dep.first->is_type<concatenation>() && dep.first->can_be_optimized())
need_reoptimization.push_back(&dep.first->as<concatenation>());
preds_layouts.push_back(dep.first->get_output_layout());
}
layout concat_layout = node.get_output_layout();
update_in_place_concat_paddings(concat_layout, preds_layouts, node.get_primitive()->axis, false);
size_t i = 0;
for (auto& dep : node.get_dependencies()) {
dep.first->set_output_layout(preds_layouts[i]);
dep.first->can_share_buffer(false);
++i;
}
node.set_output_layout(concat_layout);
node.can_be_optimized(true);
}
void concat_in_place_optimization::update_in_place_concat_paddings(
layout& concat_out_layout,
std::vector<layout>& preds_layouts,
size_t concat_axis,
bool is_runtime) {
auto concat_out_rank = concat_out_layout.get_rank();
// We need to transform axis from bf[v][u][w][z]yx order to bfxy[z][w][u][v] due to tensor.sizes() usages here
// should be removed once pad representation is changed
auto concat_axis_legacy = concat_axis;
if (concat_axis_legacy >= 2) {
auto spatial_axis = concat_axis_legacy - 2;
// Default and minimum number of dimensions is 4
auto spatial_size = std::max<size_t>(out_rank, 4) - 2;
auto spatial_size = std::max<size_t>(concat_out_rank, 4) - 2;
concat_axis_legacy = spatial_size - spatial_axis - 1 + 2;
}
if (concat_out_layout.is_dynamic() && !is_runtime) {
// set dynamic pad dims for shape agnostic kernel
for (auto& dep_output_layout : preds_layouts) {
auto info_dynamic_pad = tensor(0).sizes();
info_dynamic_pad[concat_axis_legacy] = 1;
dep_output_layout.data_padding.set_dynamic_pad(tensor(info_dynamic_pad));
}
return;
}
// Select output padding by propagating all required input paddings.
auto padd = out_layout.data_padding;
for (auto input : node.get_dependencies()) {
auto inputPadding = input.first->get_output_layout().data_padding;
auto padd = concat_out_layout.data_padding;
for (auto input : preds_layouts) {
auto inputPadding = input.data_padding;
padd = padding::max(padd, inputPadding);
}
@ -270,21 +282,17 @@ void concat_in_place_optimization::optimize_cascade(concatenation_node& node, st
// For cascade adjustment override padding in concat axis to output padding.
// In other case match(...) already checked that only first/last input have lower/upper padding.
if (node.can_be_optimized()) {
lower_padd[concat_axis_legacy] = out_layout.data_padding.lower_size().sizes()[concat_axis_legacy];
upper_padd[concat_axis_legacy] = out_layout.data_padding.upper_size().sizes()[concat_axis_legacy];
}
node.set_output_padding(padding(lower_padd, upper_padd));
lower_padd[concat_axis_legacy] = concat_out_layout.data_padding.lower_size().sizes()[concat_axis_legacy];
upper_padd[concat_axis_legacy] = concat_out_layout.data_padding.upper_size().sizes()[concat_axis_legacy];
auto dyn_pad_dims = lower_padd;
dyn_pad_dims[concat_axis_legacy] = 1;
concat_out_layout.data_padding = padding(lower_padd, upper_padd);
upper_padd[concat_axis_legacy] += out_layout.get_dims()[concat_axis];
// apply concatenation in place optimization
for (const auto& input : node.get_dependencies()) {
auto input_length = input.first->get_output_layout().get_dims()[concat_axis];
if (input.first->is_type<concatenation>() && input.first->can_be_optimized())
need_reoptimization.push_back(&input.first->as<concatenation>());
upper_padd[concat_axis_legacy] += concat_out_layout.get_dims()[concat_axis];
// apply concatenation in place optimization
for (auto& pred_layout : preds_layouts) {
auto input_length = pred_layout.get_dims()[concat_axis];
// shrink upper pad so it points at the end of the input's buffer
//
// |--- lower padd ---| |---------- upper padd -----------|
@ -292,22 +300,18 @@ void concat_in_place_optimization::optimize_cascade(concatenation_node& node, st
upper_padd[concat_axis_legacy] -= input_length;
// set new padding for input
input.first->set_output_padding(padding(lower_padd, upper_padd));
if (is_runtime)
pred_layout.data_padding = padding(lower_padd, upper_padd, 0.f, tensor(dyn_pad_dims));
else
pred_layout.data_padding = padding(lower_padd, upper_padd, 0.f);
// move lower padd further
//
// |-------------- lower padd -------------|---------- upper padd -----------|
// |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
lower_padd[concat_axis_legacy] += input_length;
}
node.can_be_optimized(true);
for (auto dep : node.get_users()) {
dep->can_share_buffer(false);
}
}
} // namespace
} // namespace cldnn
static bool can_reshape_be_optimized(const reshape_node& node) {
return node.is_in_place() && !node.has_fused_primitives();

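The padding bookkeeping in update_in_place_concat_paddings above can be pictured with a small standalone sketch (hypothetical names, not the cldnn API): lower padding accumulates the extents of earlier inputs, upper padding shrinks toward zero for later inputs, and a concat axis >= 2 is remapped to the legacy bfxy[z][w] index before tensor.sizes() is used, assuming the rank-4 minimum seen in the code above.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

// Map a bf[w][z]yx axis to the legacy bfxy[z][w] index, mirroring the transform
// applied before tensor.sizes() is queried (illustrative reimplementation).
size_t to_legacy_axis(size_t axis, size_t rank) {
    if (axis < 2) return axis;
    size_t spatial_axis = axis - 2;
    size_t spatial_size = std::max<size_t>(rank, 4) - 2; // default/minimum rank is 4
    return spatial_size - spatial_axis - 1 + 2;
}

int main() {
    // Three inputs concatenated along one axis with extents 8, 4 and 4 (total 16).
    std::vector<size_t> pred_sizes = {8, 4, 4};
    size_t total = 0;
    for (size_t len : pred_sizes) total += len;

    size_t lower = 0, upper = total;
    std::vector<std::pair<size_t, size_t>> pads; // (lower_pad, upper_pad) per input
    for (size_t len : pred_sizes) {
        upper -= len;                 // shrink upper pad so it ends at this input's last element
        pads.emplace_back(lower, upper);
        lower += len;                 // move lower pad past this input
    }

    for (size_t i = 0; i < pads.size(); ++i)
        std::cout << "input" << i << ": lower_pad=" << pads[i].first
                  << " upper_pad=" << pads[i].second << std::endl;
    // Prints 0/8, 8/4 and 12/0: every input addresses the full 16-element buffer.

    std::cout << "axis 2 of a rank-4 layout maps to legacy index "
              << to_legacy_axis(2, 4) << std::endl; // 3
    return 0;
}
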

@ -0,0 +1,62 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "pass_manager.h"
#include "program_helpers.h"
#include "concatenation_inst.h"
#include <utility>
#include <list>
#include <vector>
using namespace cldnn;
namespace cldnn {
struct concat_noop_optimization : pattern_match_optimization_typed<concat_noop_optimization, concatenation> {
// Removes concatenation nodes with single input.
using base = pattern_match_optimization_typed<concat_noop_optimization, concatenation>;
using base::base;
bool match(concatenation_node& node);
bool optimize(concatenation_node& node);
};
struct concat_in_place_optimization : pattern_match_optimization_typed<concat_in_place_optimization, concatenation> {
// Performs in-place concat optimization.
// Padding of predecessors is updated to use single buffer by all, which is output from concatenation.
// Then concatenation can be optimized out, as memory will be correctly filled by previous nodes.
// If one of the dependencies is also an optimized-out concatenation, then cascade adjustment is performed to update it.
// This optimization is expected to be executed in some topological order, as cascade adjustment is performed backwards.
using base = pattern_match_optimization_typed<concat_in_place_optimization, concatenation>;
using base::base;
// Runs concat in-place optimization and adds already optimized concatenations that need re-optimization to
// `needs_reoptimization`.
void optimize_cascade(concatenation_node& node, std::list<concatenation_node*>& need_reoptimization);
static void update_in_place_concat_paddings(layout& concat_layout,
std::vector<layout>& preds_layouts,
size_t concat_axis,
bool is_runtime);
bool match(concatenation_node& node);
static bool match(const program_node& concat_node,
kernel_impl_params concat_params,
std::vector<kernel_impl_params> pred_params,
bool is_runtime = false);
bool optimize(concatenation_node& node) {
std::list<concatenation_node*> need_reopt;
optimize_cascade(node, need_reopt);
while (!need_reopt.empty()) {
auto& prop = *need_reopt.front();
need_reopt.pop_front();
if (match(prop))
optimize_cascade(prop, need_reopt);
else
// TODO: Revert extra padding when cascade adjustment failed.
prop.can_be_optimized(false);
}
return false; // node not invalidated
}
};
} // namespace cldnn


@ -156,6 +156,13 @@ public:
virtual void set_output_memory(memory::ptr mem, bool check = true, size_t idx = 0);
void check_memory_to_set(const memory& mem, const layout& layout) const;
const std::list<const cldnn::program_node *>& get_users() const { return _node->get_users(); }
std::vector<std::shared_ptr<primitive_inst>> get_user_insts() const {
std::vector<primitive_id> users;
for (auto u : get_users()) {
users.push_back(u->id());
}
return _network.get_primitives(users);
}
const kernel_impl_params* get_impl_params() const { return _impl_params.get(); }
// return pointer to const to prevent arbitrary 'execute' call -> use primitive_inst.execute() instead
@ -199,7 +206,7 @@ public:
void set_shape_change() { _shape_changed = true; }
void build_deps();
void do_runtime_in_place_concat();
memory::ptr fused_memory(size_t dep_id) const {
return dep_memory_ptr(get_fused_mem_offset() + dep_id);
}
@ -220,7 +227,7 @@ public:
void allocate_internal_buffers();
static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node,
const kernel_impl_params& impl_params, uint32_t net_id, bool is_internal, size_t idx = 0, bool reset_mem = true);
const kernel_impl_params& impl_params, uint32_t net_id, bool is_internal, size_t idx = 0, bool reset_mem = true, bool is_output_buffer = false);
std::vector<memory::cptr> get_intermediates_memories() const { return _intermediates_memory; }
@ -257,6 +264,7 @@ protected:
program_node const* _node;
layout _node_output_layout;
bool update_shape_done_by_other = false;
std::unique_ptr<kernel_impl_params> _impl_params;
std::unique_ptr<primitive_impl> _impl;
std::unique_ptr<primitive_impl> _dynamic_impl = nullptr;
@ -317,7 +325,7 @@ protected:
size_t max_output_layout_size = 0;
std::vector<size_t> max_intermediates_memory_sizes;
std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr, bool reset_mem = true);
std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr, bool reset_mem = true, bool runtime_alloc = false);
memory::ptr allocate_internal_buffer(size_t idx);
static std::vector<std::shared_ptr<primitive_inst>> build_exec_deps(
std::vector<std::pair<std::shared_ptr<primitive_inst>, int32_t>> const& mem_deps);


@ -1083,9 +1083,35 @@ void network::build_insts_deps() {
void network::build_exec_order() {
GPU_DEBUG_DEFINE_MEM_LOGGER("build_exec_order");
for (auto& node : _program->get_processing_order()) {
if (!node->is_type<data>() && !(node->is_type<mutable_data>() && node->get_dependencies().empty())) {
add_to_exec_order(node->id());
if (!_is_dynamic) {
for (auto& node : _program->get_processing_order()) {
if (!node->is_type<data>() && !(node->is_type<mutable_data>() && node->get_dependencies().empty())) {
add_to_exec_order(node->id());
}
}
} else {
auto is_runtime_optimized_concat = [&](const program_node* node) {
return (node->is_dynamic() && node->is_type<concatenation>() && node->can_be_optimized());
};
auto is_allowed_pred_for_runtime_optimized_concat = [&](const program_node* node) {
return (!node->is_type<data>() && !(node->is_type<mutable_data>() && node->get_dependencies().empty()) &&
node->get_users().size() == 1 && is_runtime_optimized_concat(node->get_users().front()));
};
for (auto& node : _program->get_processing_order()) {
if (!node->is_type<data>() && !(node->is_type<mutable_data>() && node->get_dependencies().empty())) {
if (is_allowed_pred_for_runtime_optimized_concat(node)) {
continue;
} else if (is_runtime_optimized_concat(node)) {
// For in-place concat applied at runtime, we need to do update_shape for all the other predecessors of the concat as well,
// i.e., we need to make sure that all the parents of those predecessors have already been updated too.
for (auto dep : node->get_dependencies()) {
if (!dep.first->is_type<data>()) {
add_to_exec_order(dep.first->id());
}
}
}
add_to_exec_order(node->id());
}
}
}
}
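
The exec-order change above can be summarized with a toy sketch (illustrative types, not cldnn): a predecessor whose only user is a runtime-optimized concat is skipped in the main walk and re-emitted immediately before that concat, so shape inference for all predecessors can be completed together before the in-place decision is made. It assumes the simple single-user pattern that the pass supports.

#include <iostream>
#include <string>
#include <vector>

struct Node {                             // toy graph node, not the cldnn program_node
    std::string id;
    bool is_dynamic;
    bool optimized_concat;                // concat that may be fused in place at runtime
    std::vector<Node*> deps;
    std::vector<Node*> users;
};

std::vector<std::string> build_exec_order(const std::vector<Node*>& processing_order) {
    auto is_rt_opt_concat = [](const Node* n) { return n->is_dynamic && n->optimized_concat; };
    auto is_deferred_pred = [&](const Node* n) {
        return n->users.size() == 1 && is_rt_opt_concat(n->users.front());
    };
    std::vector<std::string> order;
    for (Node* n : processing_order) {
        if (is_deferred_pred(n))
            continue;                     // emitted later, right before its concat user
        if (is_rt_opt_concat(n))
            for (Node* d : n->deps)       // group every predecessor just before the concat
                order.push_back(d->id);
        order.push_back(n->id);
    }
    return order;
}

int main() {
    Node p1{"permute1", true, false, {}, {}};
    Node p2{"permute2", true, false, {}, {}};
    Node c{"concat", true, true, {}, {}};
    c.deps = {&p1, &p2};
    p1.users = {&c};
    p2.users = {&c};
    for (const auto& id : build_exec_order({&p1, &p2, &c}))
        std::cout << id << std::endl;     // permute1, permute2, concat
    return 0;
}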


@ -1,7 +1,7 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "program_helpers.h"
#include "primitive_inst.h"
#include "data_inst.h"
#include "mutable_data_inst.h"
@ -11,6 +11,10 @@
#include "fully_connected_inst.h"
#include "convolution_inst.h"
#include "crop_inst.h"
#include "pooling_inst.h"
#include "permute_inst.h"
#include "resample_inst.h"
#include "reshape_inst.h"
#include "eltwise_inst.h"
#include "deconvolution_inst.h"
#include "shape_of_inst.h"
@ -18,6 +22,7 @@
#include "experimental_detectron_roi_feature_extractor_inst.hpp"
#include "compilation_context.hpp"
#include "implementation_map.hpp"
#include "graph_optimizer/prepare_buffer_fusing.h"
#include "intel_gpu/plugin/common_utils.hpp"
#include "intel_gpu/graph/network.hpp"
@ -41,7 +46,8 @@
namespace cldnn {
namespace {
bool is_optimized_output_user(const program_node* user) {
template <typename T>
bool is_optimized_output_user(const T user) {
if (user->can_be_optimized()) {
if (user->is_output())
return true;
@ -56,18 +62,25 @@ bool is_optimized_output_user(const program_node* user) {
}
return false;
}
bool is_output_buffer(const program_node& node) {
if (node.is_output())
bool is_output_buffer(const primitive_inst* prim, bool runtime_alloc) {
if (prim->is_output())
return true;
// Try to recursively find any optimized out user which is also network output
for (const auto& user : node.get_users()) {
if (is_optimized_output_user(user)) {
return true;
if (runtime_alloc) {
// Try to recursively find any optimized out user which is also network output
for (const auto& user : prim->get_user_insts()) {
if (is_optimized_output_user<const std::shared_ptr<primitive_inst>>(user)) {
return true;
}
}
} else {
for (const auto& user : prim->get_node().get_users()) {
if (is_optimized_output_user<const program_node*>(user)) {
return true;
}
}
}
return false;
}
@ -179,7 +192,12 @@ void primitive_inst::set_output_memory(memory::ptr mem_new, bool check, size_t i
void primitive_inst::update_shape() {
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::shape_inference);
if (update_shape_done_by_other) {
update_shape_done_by_other = false; // reset
GPU_DEBUG_TRACE_DETAIL << id() << ": update shape is done by other: "
<< _impl_params->output_layouts[0].to_short_string() << std::endl;
return;
}
bool input_shape_changed = false;
for (size_t i = 0; i < _deps.size(); i++) {
auto idx = _deps[i].second;
@ -279,6 +297,15 @@ event::ptr primitive_inst::realloc_if_needed() {
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::memory_allocation);
event::ptr ev = nullptr;
if (_node->get_users().size() == 1 && _node->get_users().front()->is_type<concatenation>()) {
auto concat_inst = _network.get_primitive(get_users().front()->id());
if (concat_inst->can_be_optimized()) {
concat_inst->realloc_if_needed();
this->_outputs[0] = concat_inst->_outputs[0];
GPU_DEBUG_TRACE_DETAIL << id() << ": use concat user's memory " << this->_outputs[0]->buffer_ptr() << std::endl;
return ev;
}
}
// Update param if fake_alignment is available
auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params);
auto actual_layout = updated_params.get_output_layout();
@ -292,7 +319,8 @@ event::ptr primitive_inst::realloc_if_needed() {
if (can_reuse_buffer) {
GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer" << std::endl;
_outputs[0] = _network.get_engine().reinterpret_buffer(*_outputs[0], actual_layout);
if (_outputs[0]->get_layout() != actual_layout)
_outputs[0] = _network.get_engine().reinterpret_buffer(*_outputs[0], actual_layout);
if (need_reset_output_memory()) {
ev = _outputs[0]->fill(_network.get_stream());
}
@ -300,7 +328,7 @@ event::ptr primitive_inst::realloc_if_needed() {
GPU_DEBUG_TRACE_DETAIL << id() << ": realloc output memory. "
<< " Current buffer_size=" << max_output_layout_size
<< " Requested buffer_size=" << actual_layout.count() << std::endl;
_outputs = allocate_outputs(&updated_params, need_reset_output_memory());
_outputs = allocate_outputs(&updated_params, need_reset_output_memory(), true);
// TODO : need to handle multiple outputs
max_output_layout_size = updated_params.output_layouts[0].count();
}
@ -385,14 +413,14 @@ bool primitive_inst::update_impl() {
auto data_padding = params.output_layouts[i].data_padding;
for (size_t j = 0; j < output_shape_max_rank.size(); j++) {
if (is_dynamic_pad[j] == 1) {
GPU_DEBUG_TRACE_DETAIL
<< " shape_info[" << offset << "] = " << data_padding.lower_size().sizes()[j]
<< "(pad_before for output[" << i << "] " << j << "-th dim)" << std::endl;
lock[offset++] = data_padding.lower_size().sizes()[j]; // pad_before
GPU_DEBUG_TRACE_DETAIL << " shape_info[" << offset
<< "] = " << data_padding.lower_size().sizes()[j]
auto lower_pads = data_padding.lower_size().sizes(format::get_default_format(layout::max_rank()));
GPU_DEBUG_TRACE_DETAIL << " shape_info[" << offset << "] = " << lower_pads[j]
<< "(pad_before for output[" << i << "] " << j << "-th dim)" << std::endl;
lock[offset++] = lower_pads[j];
auto upper_pads = data_padding.upper_size().sizes(format::get_default_format(layout::max_rank()));
GPU_DEBUG_TRACE_DETAIL << " shape_info[" << offset << "] = " << upper_pads[j]
<< "(pad_after for output[" << i << "] " << j << "-th dim)" << std::endl;
lock[offset++] = data_padding.upper_size().sizes()[j]; // pad_after
lock[offset++] = upper_pads[j]; // pad_after
}
}
}
@ -415,10 +443,18 @@ bool primitive_inst::update_impl() {
updated_params.weights_layout = optional_layout(original_weights_memory->get_layout());
}
auto updated_params_no_dyn_pad = updated_params;
for (auto& i : updated_params_no_dyn_pad.input_layouts) {
i.data_padding.set_dynamic_pad(tensor(0));
}
for (auto& o : updated_params_no_dyn_pad.output_layouts) {
o.data_padding.set_dynamic_pad(tensor(0));
}
auto& cache = get_network().get_program()->get_implementations_cache();
std::shared_ptr<primitive_impl> cached_impl = nullptr;
{
cached_impl = cache.get(updated_params);
cached_impl = cache.get(updated_params_no_dyn_pad);
if (cached_impl) {
_impl = cached_impl->clone();
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
@ -431,13 +467,6 @@ bool primitive_inst::update_impl() {
if (!cached_impl) {
if (_dynamic_impl) {
auto& compilation_context = get_network().get_program()->get_compilation_context();
auto updated_params_no_dyn_pad = updated_params;
for (auto& i : updated_params_no_dyn_pad.input_layouts) {
i.data_padding.set_dynamic_pad(tensor(0));
}
for (auto& o : updated_params_no_dyn_pad.output_layouts) {
o.data_padding.set_dynamic_pad(tensor(0));
}
compilation_context.push_task(updated_params_no_dyn_pad.hash(), [this, &compilation_context, updated_params_no_dyn_pad]() {
if (compilation_context.is_stopped())
return;
@ -454,25 +483,23 @@ bool primitive_inst::update_impl() {
if (!can_be_optimized()) {
auto kernels = _program->get_kernels_cache().compile(updated_params_no_dyn_pad, impl->get_kernels_source());
impl->set_kernels(kernels);
cache.add(updated_params_no_dyn_pad, impl->clone());
}
cache.add(updated_params_no_dyn_pad, impl->clone());
});
if (!can_be_optimized()) {
_impl = _dynamic_impl->clone();
auto new_impl_params = _impl->canonicalize_shapes(*_impl_params);
_impl->update_dispatch_data(new_impl_params);
update_shape_info(new_impl_params);
}
} else {
_impl = _node->type()->choose_impl(*_node, updated_params);
_impl = _node->type()->choose_impl(*_node, updated_params_no_dyn_pad);
if (!can_be_optimized()) {
auto& kernels_cache = get_network().get_program()->get_kernels_cache();
auto kernels = kernels_cache.compile(updated_params, _impl->get_kernels_source());
auto kernels = kernels_cache.compile(updated_params_no_dyn_pad, _impl->get_kernels_source());
_impl->set_kernels(kernels);
cache.add(updated_params_no_dyn_pad, _impl->clone());
}
cache.add(updated_params, _impl->clone());
auto new_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
GPU_DEBUG_TRACE_DETAIL << id() << ": update impl from " << prev_impl_str << " to " << new_impl_str << std::endl;
}
@ -484,6 +511,64 @@ bool primitive_inst::update_impl() {
return true;
}
void primitive_inst::do_runtime_in_place_concat() {
if (update_shape_done_by_other)
return;
if (get_users().size() != 1) return;
auto concat_inst = _network.get_primitive(get_users().front()->id());
if (!concat_inst->get_node().is_type<concatenation>() || !concat_inst->get_node().can_be_optimized())
return;
// Currently does not support cascaded concats
std::vector<std::shared_ptr<primitive_inst>> concat_preds;
for (auto pred : concat_inst->_deps) {
concat_preds.push_back(pred.first);
}
GPU_DEBUG_TRACE_DETAIL << "[In place concat] Preparing for runtime buffer fusing" << std::endl;
// Do shape_infer for all concat's preds and concat
for (auto pred : concat_preds) {
if (!pred->update_shape_done_by_other) {
GPU_DEBUG_TRACE_DETAIL << "[In place concat] update shape for " << pred->id() << std::endl;
pred->update_shape();
pred->update_shape_done_by_other = true;
}
}
GPU_DEBUG_TRACE_DETAIL << "[In place concat] update shape for " << concat_inst->id() << std::endl;
concat_inst->update_shape();
concat_inst->update_shape_done_by_other = true;
layout concat_layout = concat_inst->_impl_params->get_output_layout();
std::vector<kernel_impl_params> pred_params;
std::vector<layout> preds_layouts;
for (auto pred : concat_inst->_deps) {
pred_params.push_back(*pred.first->_impl_params);
preds_layouts.push_back(pred.first->_impl_params->get_output_layout());
}
if (!concat_in_place_optimization::match(concat_inst->get_node(), *concat_inst->_impl_params, pred_params, true)) {
concat_inst->_can_be_optimized = false;
GPU_DEBUG_TRACE_DETAIL << "[In place concat] " << concat_inst->id() << " cannot be optimized " << std::endl;
return;
}
auto concat_axis = concat_inst->_impl_params->typed_desc<concatenation>()->axis;
concat_in_place_optimization::update_in_place_concat_paddings(concat_layout, preds_layouts, concat_axis, true);
size_t i = 0;
for (auto& dep : concat_inst->_deps) {
if (_impl_params->output_layouts[0] != preds_layouts[i]) {
dep.first->set_shape_change();
dep.first->_impl_params->output_layouts[0] = preds_layouts[i];
}
GPU_DEBUG_TRACE_DETAIL << "[In place concat] Update padding of pred " << i << " : "
<< dep.first->_impl_params->output_layouts[0].to_string() << std::endl;
++i;
}
concat_inst->_impl_params->output_layouts[0] = concat_layout;
concat_inst->_can_be_optimized = true;
GPU_DEBUG_TRACE_DETAIL << "[In place concat] " << concat_inst->id() << ": can_be_optimized " << std::endl;
}
event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
const auto primitive_id = id();
OPENVINO_ASSERT(_has_valid_input, primitive_id, " has invalid/unset input");
@ -491,10 +576,13 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
std::vector<event::ptr> dependencies;
if (is_dynamic()) {
do_runtime_in_place_concat();
OPENVINO_ASSERT(_node != nullptr, "[GPU] Invalid primitive_inst object for dynamic shapes case: program_node can't be null");
update_shape();
if (_impl_params->output_layouts[0].bytes_count() == 0) {
if (_impl_params->output_layouts[0].count() == 0) {
GPU_DEBUG_TRACE_DETAIL << id() << " : Skipping because output data is empty " << std::endl;
auto ev = get_network().get_stream().create_user_event(true);
update_shape_done_by_other = false; // reset
return ev;
}
@ -539,7 +627,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
}
}
}
update_shape_done_by_other = false; // reset
OPENVINO_ASSERT(_impl_params->get_output_layout().is_static(),
"[GPU] Can't execute ", primitive_id, " primitive as output layout is dynamic in runtime");
@ -914,7 +1002,7 @@ static bool user_requesting_mem_reuse_false(const program_node& node) {
}
memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool, const program_node& _node, const kernel_impl_params& impl_params,
uint32_t net_id, bool is_internal, size_t idx, bool reset) {
uint32_t net_id, bool is_internal, size_t idx, bool reset, bool is_output_buffer) {
auto get_memory_from_pool = [&](engine& _engine, const layout& layout, const primitive_id id, std::set<primitive_id> dependencies,
allocation_type type, bool reusable, bool reset = true) {
OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout without upper bound");
@ -925,7 +1013,6 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
return pool.get_memory(static_layout, type, reset);
};
auto layout = impl_params.get_output_layout(idx);
OPENVINO_ASSERT(layout.is_static() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout");
auto device_mem_acc = [&](size_t a, const cldnn::layout& l) {
@ -942,16 +1029,12 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
if (total_device_input_mem_size > _engine.get_device_info().max_global_mem_size)
usm_device_allocatable = false;
bool memory_reuse_by_user = true;
if (user_requesting_mem_reuse_false(_node)) {
memory_reuse_by_user = false;
}
bool memory_reuse_by_user = !user_requesting_mem_reuse_false(_node);
// For outputs, cpu prim we want to have lockable alloc type
// Also if the successor of a node is an cpu, then memory needs to be lockable.
bool is_cpu = _node.get_selected_impl() ? _node.get_selected_impl()->is_cpu() : false;
auto use_lockable_memory = is_output_buffer(_node) || is_cpu || is_any_user_cpu(_node.get_users()) ||
auto use_lockable_memory = is_output_buffer || is_cpu || is_any_user_cpu(_node.get_users()) ||
!_engine.supports_allocation(allocation_type::usm_device) ||
(_node.is_shape_infer_dep() && _engine.get_device_info().dev_type == device_type::integrated_gpu);
const auto& lockable_mem_type = _engine.get_lockable_preferred_memory_allocation_type(layout.format.is_image_2d());
@ -959,47 +1042,46 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
auto alloc_type = use_lockable_memory ? lockable_mem_type
: !usm_device_allocatable ? lockable_mem_type : allocation_type::usm_device;
if ((is_internal && (_node.can_be_optimized() || _node.is_type<generic_layer>())) || (memory_reuse_by_user == false)) {
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
// Use usm_device memory for weights reordering
if (is_internal && _node.is_type<generic_layer>() && _engine.supports_allocation(allocation_type::usm_device))
alloc_type = allocation_type::usm_device;
return get_memory_from_pool(_engine,
layout,
_node.id(),
_node.get_memory_dependencies(),
alloc_type,
false,
reset);
} else if (is_internal && _node.is_output() && _node.is_type<generic_layer>() &&
_engine.supports_allocation(allocation_type::usm_device) && usm_device_allocatable) {
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
return _engine.allocate_memory(layout, allocation_type::usm_device, false);
} else if (is_internal && !_node.is_output() && _node.is_type<input_layout>()) {
// Skip memory reset for input_layout primitives, since data will be copied from cldnn::data primitive
// or just reuse primitive's memory
GPU_DEBUG_LOG << "[" << _node.id() << ": constant]" << std::endl;
return _engine.allocate_memory(layout, alloc_type, false);
} else if (is_internal || (!_node.can_share_buffer()) || _node.can_be_optimized() || _node.is_output()) {
if (is_internal) {
if (_node.can_be_optimized() || _node.is_type<generic_layer>()) {
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
// Use usm_device memory for weights reordering
if (is_internal && _node.is_type<generic_layer>() &&
_engine.supports_allocation(allocation_type::usm_device))
alloc_type = allocation_type::usm_device;
return get_memory_from_pool(_engine,
layout,
_node.id(),
_node.get_memory_dependencies(),
alloc_type,
false,
reset);
} else {
if ((_node.is_output() && _node.is_type<generic_layer>()) || (!_node.is_output() && _node.is_type<input_layout>()))
reset = false;
GPU_DEBUG_LOG << "[" << _node.id() << ": constant]" << std::endl;
return _engine.allocate_memory(layout, alloc_type, reset);
}
} else if (!_node.can_share_buffer() || _node.can_be_optimized() || _node.is_output()) {
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
return _engine.allocate_memory(layout, alloc_type, reset);
} else {
return get_memory_from_pool(_engine,
layout,
_node.id(),
_node.get_memory_dependencies(),
alloc_type,
true,
reset);
layout,
_node.id(),
_node.get_memory_dependencies(),
alloc_type,
memory_reuse_by_user,
reset);
}
}
std::vector<memory::ptr> primitive_inst::allocate_outputs(kernel_impl_params* updated_params, bool reset_mem) {
std::vector<memory::ptr> primitive_inst::allocate_outputs(kernel_impl_params* updated_params, bool reset_mem, bool runtime_alloc) {
std::vector<memory::ptr> outputs;
for (size_t i = 0; i < get_node().get_outputs_count() ; ++i) {
outputs.push_back(allocate_output(get_network().get_engine(), _network.get_memory_pool(),
*_node, (updated_params != nullptr) ? *updated_params : *_impl_params,
get_network_id(), _network.is_internal(), i, reset_mem));
get_network_id(), _network.is_internal(), i, reset_mem, is_output_buffer(this, runtime_alloc)));
}
return outputs;
}


@ -1598,7 +1598,15 @@ std::pair<int64_t, int64_t> program::get_estimated_device_mem_usage() {
} else if (node->is_type<mutable_data>() && node->get_dependencies().empty()) {
continue;
} else {
allocated_mem_ptrs.insert(primitive_inst::allocate_output(engine, pool, *node, *node->get_kernel_impl_params(), 0, false));
allocated_mem_ptrs.insert(primitive_inst::allocate_output(engine,
pool,
*node,
*node->get_kernel_impl_params(),
0,
false,
0,
false,
node->is_output()));
}
}


@ -152,3 +152,107 @@ TEST(prepare_buffer_fusing, propagate_data_padding) {
ASSERT_EQ(output_ptr[i], input_ptr[i]);
}
}
TEST(prepare_buffer_fusing, in_place_concat_static) {
auto& engine = get_test_engine();
auto in_layout1 = layout{ ov::PartialShape{1, 2, 3, 4}, data_types::f32, format::bfyx }; // => {1, 4, 3, 2}
auto in_layout2 = layout{ ov::PartialShape{1, 2, 4, 1}, data_types::f32, format::bfyx }; // => {1, 4, 1, 2}
topology topology;
topology.add(input_layout("input1", in_layout1));
topology.add(input_layout("input2", in_layout2));
topology.add(permute("permute1", input_info("input1"), {0, 3, 2, 1}));
topology.add(permute("permute2", input_info("input2"), {3, 2, 0, 1}));
topology.add(concatenation("concat", { input_info("permute1"), input_info("permute2") }, 2));
topology.add(permute("output", input_info("concat"), {0, 2, 3, 1}));
ExecutionConfig config;
config.set_property(ov::intel_gpu::optimize_data(true));
auto prog = program::build_program(engine, topology, config, false, false);
ASSERT_NE(prog, nullptr);
cldnn::network net(prog, 0);
auto input_memory1 = engine.allocate_memory(in_layout1);
auto input_memory2 = engine.allocate_memory(in_layout2);
set_values<float>(input_memory1,
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 11.0, 22.0, 33.0, 44.0, 55.0, 66.0,
111.0, 222.0, 333.0, 444.0, 555.0, 666.0, 1111.0, 2222.0, 3333.0, 4444.0, 5555.0, 6666.0});
set_values<float>(input_memory2, {1234.0, 2345.0, 3456.0, 4567.0, 5678.0, 6789.0, 9012.0, 9999.0});
net.set_input_data("input1", input_memory1);
net.set_input_data("input2", input_memory2);
std::map<cldnn::primitive_id, cldnn::network_output> output;
EXPECT_NO_THROW(output = net.execute());
const auto& concat_node = net.get_primitive("concat")->get_node();
auto concat_mem = net.get_primitive("concat")->output_memory_ptr();
auto permute1_mem = net.get_primitive("permute1")->output_memory_ptr();
auto permute2_mem = net.get_primitive("permute2")->output_memory_ptr();
ASSERT_TRUE(concat_node.can_be_optimized());
ASSERT_EQ(concat_mem, permute1_mem);
ASSERT_EQ(concat_mem, permute2_mem);
auto out_lay = net.get_output_layout("output");
auto out_mem = output.at("output").get_memory();
cldnn::mem_lock<float> output_ptr(out_mem, get_test_stream());
std::vector<float> ref_output = {1.0, 2.0, 3.0, 4.0, 111.0, 222.0, 333.0, 444.0, 5.0, 6.0, 11.0,
22.0, 555.0, 666.0, 1111.0, 2222.0, 33.0, 44.0, 55.0, 66.0, 3333.0, 4444.0,
5555.0, 6666.0, 1234.0, 2345.0, 3456.0, 4567.0, 5678.0, 6789.0, 9012.0, 9999.0};
for (size_t x = 0; x < out_lay.count(); ++x) {
ASSERT_EQ(ref_output[x], output_ptr[x]);
}
}
TEST(prepare_buffer_fusing, in_place_concat_dynamic) {
auto& engine = get_test_engine();
auto in_layout1_0 = layout{ ov::PartialShape::dynamic(4), data_types::f32, format::bfyx };
auto in_layout2_0 = layout{ ov::PartialShape::dynamic(4), data_types::f32, format::bfyx };
auto in_layout1 = layout{ ov::PartialShape{1, 2, 3, 4}, data_types::f32, format::bfyx };
auto in_layout2 = layout{ ov::PartialShape{1, 2, 4, 1}, data_types::f32, format::bfyx };
topology topology;
topology.add(input_layout("input1", in_layout1_0));
topology.add(input_layout("input2", in_layout2_0));
topology.add(permute("permute1", input_info("input1"), {0, 3, 2, 1}));
topology.add(permute("permute2", input_info("input2"), {3, 2, 0, 1}));
topology.add(concatenation("concat", { input_info("permute1"), input_info("permute2") }, 2));
topology.add(permute("output", input_info("concat"), {0, 2, 3, 1}));
ExecutionConfig config;
config.set_property(ov::intel_gpu::optimize_data(true));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
auto prog = program::build_program(engine, topology, config, false, false);
ASSERT_NE(prog, nullptr);
cldnn::network net(prog, 0);
auto input_memory1 = engine.allocate_memory(in_layout1);
auto input_memory2 = engine.allocate_memory(in_layout2);
set_values<float>(input_memory1,
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 11.0, 22.0, 33.0, 44.0, 55.0, 66.0,
111.0, 222.0, 333.0, 444.0, 555.0, 666.0, 1111.0, 2222.0, 3333.0, 4444.0, 5555.0, 6666.0});
set_values<float>(input_memory2, {1234.0, 2345.0, 3456.0, 4567.0, 5678.0, 6789.0, 9012.0, 9999.0});
net.set_input_data("input1", input_memory1);
net.set_input_data("input2", input_memory2);
std::vector<float> ref_output = {1.0, 2.0, 3.0, 4.0, 111.0, 222.0, 333.0, 444.0, 5.0, 6.0, 11.0,
22.0, 555.0, 666.0, 1111.0, 2222.0, 33.0, 44.0, 55.0, 66.0, 3333.0, 4444.0,
5555.0, 6666.0, 1234.0, 2345.0, 3456.0, 4567.0, 5678.0, 6789.0, 9012.0, 9999.0};
std::map<cldnn::primitive_id, cldnn::network_output> output;
EXPECT_NO_THROW(output = net.execute());
auto out_l = net.get_output_layout("output");
auto out_mem = output.at("output").get_memory();
cldnn::mem_lock<float> output_ptr(out_mem, get_test_stream());
const auto& concat_node = net.get_primitive("concat")->get_node();
auto concat_mem = net.get_primitive("concat")->output_memory_ptr();
auto permute1_mem = net.get_primitive("permute1")->output_memory_ptr();
auto permute2_mem = net.get_primitive("permute2")->output_memory_ptr();
ASSERT_TRUE(concat_node.can_be_optimized());
ASSERT_EQ(concat_mem.get(), permute1_mem.get());
ASSERT_EQ(concat_mem.get(), permute2_mem.get());
for (size_t x = 0; x < out_l.count(); ++x) {
ASSERT_EQ(ref_output[x], output_ptr[x]);
}
}