Add new debug config "DisableRuntimeBufferFusing" (#18726)

2023-07-24 14:56:21 -07:00 · 2023-07-24 14:56:21 -07:00 · ce729761d6
commit ce729761d6
parent f70ef8be5b
4 changed files with 16 additions and 3 deletions
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
@ -119,6 +119,7 @@ public:
    int max_kernels_per_batch;                  // Maximum number of kernels in a batch during compiling kernels
    int disable_async_compilation;              // Disable async compilation
    int disable_dynamic_impl;                   // Disable dynamic implementation
    int disable_runtime_buffer_fusing;          // Disable runtime buffer fusing
    std::set<int64_t> dump_iteration;           // Dump n-th execution of network.
    static const debug_configuration *get_instance();
    bool is_dumped_layer(const std::string& layerName, bool is_output = false) const;
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
@ -69,6 +69,11 @@ bool concat_in_place_optimization::match(const program_node& concat_node,
                                         bool is_runtime) {
    if (concat_node.is_output() || concat_params.fused_desc.size() > 0 || concat_node.is_in_shape_of_subgraph())
        return false;
    bool do_runtime_buffer_fusing = true;
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->disable_runtime_buffer_fusing) {
        do_runtime_buffer_fusing = false;
    }
    auto pred_nodes = concat_node.get_dependencies();
    for (auto p : pred_nodes) {
        // TODO : In dynamic shape only one user is allowed for optimized concat
@ -79,9 +84,9 @@ bool concat_in_place_optimization::match(const program_node& concat_node,
        // for simple patterns where the concat is the only user of all the preds.
        // Also cascaded concat is not handled for dynamic shape, for now.
        // If we have more flexible exec order handling in the future we'll be able to remove this condition below
-        if (p.first->is_dynamic() && p.first->get_users().size() > 1)
+        if (p.first->is_dynamic() && (!do_runtime_buffer_fusing || p.first->get_users().size() > 1))
            return false;
-        if (concat_node.is_dynamic() && !p.first->is_dynamic())
+        if (concat_node.is_dynamic() && (!do_runtime_buffer_fusing || !p.first->is_dynamic()))
            return false;
    }
    // If this is called in primitive_inst::execute() and concat is static, that concat should already have been optimized at build time, not at runtime.
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@ -625,6 +625,10 @@ bool primitive_inst::update_impl() {
 }
 void primitive_inst::do_runtime_in_place_concat() {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->disable_runtime_buffer_fusing) {
        return;
    }
    if (update_shape_done_by_other)
        return;
    if (get_users().size() != 1) return;
--- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
+++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@ -133,6 +133,7 @@ static void print_help_messages() {
    message_list.emplace_back("OV_GPU_MaxKernelsPerBatch", "Maximum number of kernels in a batch during compiling kernels");
    message_list.emplace_back("OV_GPU_DisableAsyncCompilation", "Disable async compilation");
    message_list.emplace_back("OV_GPU_DisableDynamicImpl", "Disable dynamic implementation");
    message_list.emplace_back("OV_GPU_DisableRuntimeBufferFusing", "Disable runtime buffer fusing");
    message_list.emplace_back("OV_GPU_DumpIteration", "Dump n-th execution of network, separated by space.");
    message_list.emplace_back("OV_GPU_MemPreallocationOptions", "Controls buffer pre-allocation feature. Expects 4 values separated by space in"
                              "the following order: number of iterations for pre-allocation(int), max size of single iteration in bytes(int), "
@ -175,7 +176,8 @@ debug_configuration::debug_configuration()
        , serialize_compile(0)
        , max_kernels_per_batch(0)
        , disable_async_compilation(0)
-        , disable_dynamic_impl(0) {
+        , disable_dynamic_impl(0)
        , disable_runtime_buffer_fusing(0) {
 #ifdef GPU_DEBUG_CONFIG
    get_gpu_debug_env_var("Help", help);
    get_common_debug_env_var("Verbose", verbose);
@ -205,6 +207,7 @@ debug_configuration::debug_configuration()
    get_gpu_debug_env_var("MaxKernelsPerBatch", max_kernels_per_batch);
    get_gpu_debug_env_var("DisableAsyncCompilation", disable_async_compilation);
    get_gpu_debug_env_var("DisableDynamicImpl", disable_dynamic_impl);
    get_gpu_debug_env_var("DisableRuntimeBufferFusing", disable_runtime_buffer_fusing);
    std::string dump_iteration_str;
    get_gpu_debug_env_var("DumpIteration", dump_iteration_str);
    std::string mem_preallocation_params_str;