From ce729761d69fb7d6058933bdc5f47f3d1bcd2659 Mon Sep 17 00:00:00 2001
From: Taylor Yeonbok Lee <taylor.lee@intel.com>
Date: Mon, 24 Jul 2023 14:56:21 -0700
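Subject: [PATCH] Add new debug config "DisableRuntimeBufferFusing" (#18726)

Add the OV_GPU_DisableRuntimeBufferFusing debug option. When it is set,
concat_in_place_optimization::match() rejects concats that are dynamic
or have a dynamic predecessor, and
primitive_inst::do_runtime_in_place_concat() returns immediately. The
variable is only read in builds that define GPU_DEBUG_CONFIG.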

---
 .../include/intel_gpu/runtime/debug_configuration.hpp    | 1 +
 .../src/graph/graph_optimizer/prepare_buffer_fusing.cpp  | 9 +++++++--
 src/plugins/intel_gpu/src/graph/primitive_inst.cpp       | 4 ++++
 .../intel_gpu/src/runtime/debug_configuration.cpp        | 5 ++++-
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
index 2f4eb3128be..998fb93aca1 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
@@ -119,6 +119,7 @@ public:
     int max_kernels_per_batch;                  // Maximum number of kernels in a batch during compiling kernels
     int disable_async_compilation;              // Disable async compilation
     int disable_dynamic_impl;                   // Disable dynamic implementation
+    int disable_runtime_buffer_fusing;          // Disable runtime buffer fusing (runtime in-place concat for dynamic shapes)
     std::set<int64_t> dump_iteration;           // Dump n-th execution of network.
     static const debug_configuration *get_instance();
     bool is_dumped_layer(const std::string& layerName, bool is_output = false) const;
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
index 82f14d74de2..964986147f7 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
@@ -69,6 +69,11 @@ bool concat_in_place_optimization::match(const program_node& concat_node,
                                          bool is_runtime) {
     if (concat_node.is_output() || concat_params.fused_desc.size() > 0 || concat_node.is_in_shape_of_subgraph())
         return false;
+    bool do_runtime_buffer_fusing = true;
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->disable_runtime_buffer_fusing) {
+        do_runtime_buffer_fusing = false;
+    }
     auto pred_nodes = concat_node.get_dependencies();
     for (auto p : pred_nodes) {
         // TODO : In dynamic shape only one user is allowed for optimzied concat
@@ -79,9 +84,9 @@ bool concat_in_place_optimization::match(const program_node& concat_node,
         // for simple patterns where the concat is the only user of all the preds.
         // Also cascaded concat is not handled for dynamic shape. for now.
         // If we have more flexible exec order handling in the future we'll be able to remove this condition below
-        if (p.first->is_dynamic() && p.first->get_users().size() > 1)
+        if (p.first->is_dynamic() && (!do_runtime_buffer_fusing || p.first->get_users().size() > 1))
             return false;
-        if (concat_node.is_dynamic() && !p.first->is_dynamic())
+        if (concat_node.is_dynamic() && (!do_runtime_buffer_fusing || !p.first->is_dynamic()))
             return false;
     }
     // if this is called in primitive_inst::execute() and concat is static, that concat should already be optimized in build time, not in runtime.
diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
index fdb6bb7a16e..da910cbb91e 100644
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -625,6 +625,10 @@ bool primitive_inst::update_impl() {
 }
 
 void primitive_inst::do_runtime_in_place_concat() {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->disable_runtime_buffer_fusing) {
+        return;
+    }
     if (update_shape_done_by_other)
         return;
     if (get_users().size() != 1) return;
diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
index ac92db226e9..23356a187bc 100644
--- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
+++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@@ -133,6 +133,7 @@ static void print_help_messages() {
     message_list.emplace_back("OV_GPU_MaxKernelsPerBatch", "Maximum number of kernels in a batch during compiling kernels");
     message_list.emplace_back("OV_GPU_DisableAsyncCompilation", "Disable async compilation");
     message_list.emplace_back("OV_GPU_DisableDynamicImpl", "Disable dynamic implementation");
+    message_list.emplace_back("OV_GPU_DisableRuntimeBufferFusing", "Disable runtime buffer fusing");
     message_list.emplace_back("OV_GPU_DumpIteration", "Dump n-th execution of network, separated by space.");
     message_list.emplace_back("OV_GPU_MemPreallocationOptions", "Controls buffer pre-allocation feature. Expects 4 values separated by space in"
                               "the following order: number of iterations for pre-allocation(int), max size of single iteration in bytes(int), "
@@ -175,7 +176,8 @@ debug_configuration::debug_configuration()
         , serialize_compile(0)
         , max_kernels_per_batch(0)
         , disable_async_compilation(0)
-        , disable_dynamic_impl(0) {
+        , disable_dynamic_impl(0)
+        , disable_runtime_buffer_fusing(0) {
 #ifdef GPU_DEBUG_CONFIG
     get_gpu_debug_env_var("Help", help);
     get_common_debug_env_var("Verbose", verbose);
@@ -205,6 +207,7 @@ debug_configuration::debug_configuration()
     get_gpu_debug_env_var("MaxKernelsPerBatch", max_kernels_per_batch);
     get_gpu_debug_env_var("DisableAsyncCompilation", disable_async_compilation);
     get_gpu_debug_env_var("DisableDynamicImpl", disable_dynamic_impl);
+    get_gpu_debug_env_var("DisableRuntimeBufferFusing", disable_runtime_buffer_fusing);
     std::string dump_iteration_str;
     get_gpu_debug_env_var("DumpIteration", dump_iteration_str);
     std::string mem_preallocation_params_str;