[GPU] Add debug config for disabled primitive fusing (#18938)

* Add debug config for disabled primitive fusing
* Apply comment
* Fix failed TCs on CI
2023-08-08 10:02:48 +09:00 · 2023-08-08 10:02:48 +09:00 · a2807f1edb
commit a2807f1edb
parent d278ff5786
3 changed files with 54 additions and 37 deletions
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
@ -94,40 +94,41 @@ private:

 public:
    static const char *prefix;
-    int help;                                       // Print help messages
-    int verbose;                                    // Verbose execution
-    int verbose_color;                              // Print verbose color
-    int list_layers;                                // Print list layers
-    int print_multi_kernel_perf;                    // Print execution time of each kernel in multi-kernel primitimive
-    int disable_usm;                                // Disable usm usage
-    int disable_onednn;                             // Disable onednn for discrete GPU (no effect for integrated GPU)
-    int disable_onednn_opt_post_ops;                // Disable onednn optimize post operators
-    std::string dump_profiling_data;                // Enables dump of extended performance profiling to specified dir
-    std::string dump_graphs;                        // Dump optimized graph
-    std::string dump_sources;                       // Dump opencl sources
-    std::string dump_layers_path;                   // Enable dumping intermediate buffers and set the dest path
-    std::vector<std::string> dump_layers;           // Dump intermediate buffers of specified layers only
-    std::string dry_run_path;                       // Dry run and serialize execution graph into the specified path
-    int dump_layers_dst_only;                       // Dump only output of layers
-    int dump_layers_result;                         // Dump result layers
-    int dump_layers_input;                          // Dump input layers
-    int dump_layers_limit_batch;                    // Limit the size of batch to dump
-    int dump_layers_raw;                            // Dump raw data.
-    int dump_layers_binary;                         // Dump binary data.
-    int dump_runtime_memory_pool;                   // Dump memory pool status at each iteration
-    int base_batch_for_memory_estimation;           // Base batch size to be used in memory estimation
-    std::vector<std::string> after_proc;            // Start inference after the listed processes
-    int serialize_compile;                          // Serialize creating primitives and compiling kernels
-    std::vector<std::string> forced_impl_types;     // Force implementation type either ocl or onednn
-    int max_kernels_per_batch;                      // Maximum number of kernels in a batch during compiling kernels
-    int disable_async_compilation;                  // Disable async compilation
-    int disable_dynamic_impl;                       // Disable dynamic implementation
-    int disable_runtime_buffer_fusing;              // Disable runtime buffer fusing
-    int disable_memory_reuse;                       // Disable memmory reuse among layers
-    int disable_build_time_weight_reorder_for_dynamic_nodes; // Disable build time weight reordering for dynamic nodes
-    int disable_runtime_skip_reorder;               // Disable runtime skip reorder
-    std::set<int64_t> dump_iteration;               // Dump n-th execution of network.
-    std::vector<std::string> load_layers_raw_dump;  // List of layers to load dumped raw binary and filenames
+    int help;                                                   // Print help messages
+    int verbose;                                                // Verbose execution
+    int verbose_color;                                          // Print verbose color
+    int list_layers;                                            // Print list layers
+    int print_multi_kernel_perf;                                // Print execution time of each kernel in multi-kernel primitive
+    int disable_usm;                                            // Disable usm usage
+    int disable_onednn;                                         // Disable onednn for discrete GPU (no effect for integrated GPU)
+    int disable_onednn_opt_post_ops;                            // Disable onednn optimize post operators
+    std::string dump_profiling_data;                            // Enables dump of extended performance profiling to specified dir
+    std::string dump_graphs;                                    // Dump optimized graph
+    std::string dump_sources;                                   // Dump opencl sources
+    std::string dump_layers_path;                               // Enable dumping intermediate buffers and set the dest path
+    std::vector<std::string> dump_layers;                       // Dump intermediate buffers of specified layers only
+    std::string dry_run_path;                                   // Dry run and serialize execution graph into the specified path
+    int dump_layers_dst_only;                                   // Dump only output of layers
+    int dump_layers_result;                                     // Dump result layers
+    int dump_layers_input;                                      // Dump input layers
+    int dump_layers_limit_batch;                                // Limit the size of batch to dump
+    int dump_layers_raw;                                        // Dump raw data.
+    int dump_layers_binary;                                     // Dump binary data.
+    int dump_runtime_memory_pool;                               // Dump memory pool status at each iteration
+    int base_batch_for_memory_estimation;                       // Base batch size to be used in memory estimation
+    std::vector<std::string> after_proc;                        // Start inference after the listed processes
+    int serialize_compile;                                      // Serialize creating primitives and compiling kernels
+    std::vector<std::string> forced_impl_types;                 // Force implementation type either ocl or onednn
+    int max_kernels_per_batch;                                  // Maximum number of kernels in a batch during compiling kernels
+    int disable_async_compilation;                              // Disable async compilation
+    int disable_dynamic_impl;                                   // Disable dynamic implementation
+    int disable_runtime_buffer_fusing;                          // Disable runtime buffer fusing
+    int disable_memory_reuse;                                   // Disable memory reuse among layers
+    int disable_build_time_weight_reorder_for_dynamic_nodes;    // Disable build time weight reordering for dynamic nodes
+    int disable_runtime_skip_reorder;                           // Disable runtime skip reorder
+    int disable_primitive_fusing;                               // Disable primitive fusing
+    std::set<int64_t> dump_iteration;                           // Dump n-th execution of network.
+    std::vector<std::string> load_layers_raw_dump;              // List of layers to load dumped raw binary and filenames
    static const debug_configuration *get_instance();
    std::vector<std::string> get_filenames_for_matched_layer_loading_binaries(const std::string& id) const;
    std::string get_name_for_dump(const std::string& file_name) const;
--- a/src/plugins/intel_gpu/src/graph/program.cpp
+++ b/src/plugins/intel_gpu/src/graph/program.cpp
@ -543,11 +543,24 @@ void program::pre_optimize_graph(bool is_internal) {

    reorder_factory rf;
    if (optimize_data) {
-        apply_opt_pass<prepare_primitive_fusing_through>();
+        GPU_DEBUG_GET_INSTANCE(debug_config);
+#ifdef GPU_DEBUG_CONFIG
+        GPU_DEBUG_IF(!debug_config->disable_primitive_fusing) {
+#else
+        {
+#endif
+            apply_opt_pass<prepare_primitive_fusing_through>();
+        }

        apply_opt_pass<pre_replace_deconv>(lo);

-        apply_opt_pass<prepare_primitive_fusing>(lo);
+#ifdef GPU_DEBUG_CONFIG
+        GPU_DEBUG_IF(!debug_config->disable_primitive_fusing) {
+#else
+        {
+#endif
+            apply_opt_pass<prepare_primitive_fusing>(lo);
+        }

        apply_opt_pass<select_preferred_formats>(lo);

--- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
+++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@ -137,6 +137,7 @@ static void print_help_messages() {
    message_list.emplace_back("OV_GPU_DisableDynamicImpl", "Disable dynamic implementation");
    message_list.emplace_back("OV_GPU_DisableRuntimeBufferFusing", "Disable runtime buffer fusing");
    message_list.emplace_back("OV_GPU_DisableMemoryReuse", "Disable memory reuse");
+    message_list.emplace_back("OV_GPU_DisablePrimitiveFusing", "Disable primitive fusing");
    message_list.emplace_back("OV_GPU_DumpIteration", "Dump n-th execution of network, separated by space.");
    message_list.emplace_back("OV_GPU_MemPreallocationOptions", "Controls buffer pre-allocation feature. Expects 4 values separated by space in"
                              "the following order: number of iterations for pre-allocation(int), max size of single iteration in bytes(int), "
@ -191,7 +192,8 @@ debug_configuration::debug_configuration()
        , disable_runtime_buffer_fusing(0)
        , disable_memory_reuse(0)
        , disable_build_time_weight_reorder_for_dynamic_nodes(0)
-        , disable_runtime_skip_reorder(0) {
+        , disable_runtime_skip_reorder(0)
+        , disable_primitive_fusing(0) {
 #ifdef GPU_DEBUG_CONFIG
    get_gpu_debug_env_var("Help", help);
    get_common_debug_env_var("Verbose", verbose);
@ -228,6 +230,7 @@ debug_configuration::debug_configuration()
    get_gpu_debug_env_var("DisableMemoryReuse", disable_memory_reuse);
    get_gpu_debug_env_var("DisableBuildTimeWeightReorderForDynamicNodes", disable_build_time_weight_reorder_for_dynamic_nodes);
    get_gpu_debug_env_var("DisableRuntimeSkipReorder", disable_runtime_skip_reorder);
+    get_gpu_debug_env_var("DisablePrimitiveFusing", disable_primitive_fusing);
    std::string dump_iteration_str;
    get_gpu_debug_env_var("DumpIteration", dump_iteration_str);
    std::string mem_preallocation_params_str;