From ce729761d69fb7d6058933bdc5f47f3d1bcd2659 Mon Sep 17 00:00:00 2001 From: Taylor Yeonbok Lee Date: Mon, 24 Jul 2023 14:56:21 -0700 Subject: [PATCH] Add new debug config "DisableRuntimeBufferFusing" (#18726) --- .../include/intel_gpu/runtime/debug_configuration.hpp | 1 + .../src/graph/graph_optimizer/prepare_buffer_fusing.cpp | 9 +++++++-- src/plugins/intel_gpu/src/graph/primitive_inst.cpp | 4 ++++ .../intel_gpu/src/runtime/debug_configuration.cpp | 5 ++++- 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index 2f4eb3128be..998fb93aca1 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -119,6 +119,7 @@ public: int max_kernels_per_batch; // Maximum number of kernels in a batch during compiling kernels int disable_async_compilation; // Disable async compilation int disable_dynamic_impl; // Disable dynamic implementation + int disable_runtime_buffer_fusing; // Disable runtime buffer fusing std::set dump_iteration; // Dump n-th execution of network. static const debug_configuration *get_instance(); bool is_dumped_layer(const std::string& layerName, bool is_output = false) const; diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 82f14d74de2..964986147f7 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -69,6 +69,11 @@ bool concat_in_place_optimization::match(const program_node& concat_node, bool is_runtime) { if (concat_node.is_output() || concat_params.fused_desc.size() > 0 || concat_node.is_in_shape_of_subgraph()) return false; + bool do_runtime_buffer_fusing = true; + GPU_DEBUG_GET_INSTANCE(debug_config); + GPU_DEBUG_IF(debug_config->disable_runtime_buffer_fusing) { + do_runtime_buffer_fusing = false; + } auto pred_nodes = concat_node.get_dependencies(); for (auto p : pred_nodes) { // TODO : In dynamic shape only one user is allowed for optimzied concat @@ -79,9 +84,9 @@ bool concat_in_place_optimization::match(const program_node& concat_node, // for simple patterns where the concat is the only user of all the preds. // Also cascaded concat is not handled for dynamic shape. for now. // If we have more flexible exec order handling in the future we'll be able to remove this condition below - if (p.first->is_dynamic() && p.first->get_users().size() > 1) + if (p.first->is_dynamic() && (!do_runtime_buffer_fusing || p.first->get_users().size() > 1)) return false; - if (concat_node.is_dynamic() && !p.first->is_dynamic()) + if (concat_node.is_dynamic() && (!do_runtime_buffer_fusing || !p.first->is_dynamic())) return false; } // if this is called in primitive_inst::execute() and concat is static, that concat should already be optimized in build time, not in runtime. diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index fdb6bb7a16e..da910cbb91e 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -625,6 +625,10 @@ bool primitive_inst::update_impl() { } void primitive_inst::do_runtime_in_place_concat() { + GPU_DEBUG_GET_INSTANCE(debug_config); + GPU_DEBUG_IF(debug_config->disable_runtime_buffer_fusing) { + return; + } if (update_shape_done_by_other) return; if (get_users().size() != 1) return; diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp index ac92db226e9..23356a187bc 100644 --- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp +++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp @@ -133,6 +133,7 @@ static void print_help_messages() { message_list.emplace_back("OV_GPU_MaxKernelsPerBatch", "Maximum number of kernels in a batch during compiling kernels"); message_list.emplace_back("OV_GPU_DisableAsyncCompilation", "Disable async compilation"); message_list.emplace_back("OV_GPU_DisableDynamicImpl", "Disable dynamic implementation"); + message_list.emplace_back("OV_GPU_DisableRuntimeBufferFusing", "Disable runtime buffer fusing"); message_list.emplace_back("OV_GPU_DumpIteration", "Dump n-th execution of network, separated by space."); message_list.emplace_back("OV_GPU_MemPreallocationOptions", "Controls buffer pre-allocation feature. Expects 4 values separated by space in" "the following order: number of iterations for pre-allocation(int), max size of single iteration in bytes(int), " @@ -175,7 +176,8 @@ debug_configuration::debug_configuration() , serialize_compile(0) , max_kernels_per_batch(0) , disable_async_compilation(0) - , disable_dynamic_impl(0) { + , disable_dynamic_impl(0) + , disable_runtime_buffer_fusing(0) { #ifdef GPU_DEBUG_CONFIG get_gpu_debug_env_var("Help", help); get_common_debug_env_var("Verbose", verbose); @@ -205,6 +207,7 @@ debug_configuration::debug_configuration() get_gpu_debug_env_var("MaxKernelsPerBatch", max_kernels_per_batch); get_gpu_debug_env_var("DisableAsyncCompilation", disable_async_compilation); get_gpu_debug_env_var("DisableDynamicImpl", disable_dynamic_impl); + get_gpu_debug_env_var("DisableRuntimeBufferFusing", disable_runtime_buffer_fusing); std::string dump_iteration_str; get_gpu_debug_env_var("DumpIteration", dump_iteration_str); std::string mem_preallocation_params_str;