Add Debug Config for maximum kernels per batch (#12068)

Signed-off-by: Andrew Park <andrew.park@intel.com>
This commit is contained in:
Andrew Kwangwoong Park 2022-07-07 20:26:51 +09:00 committed by GitHub
parent cd6c7da91c
commit 32937ab7ca
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 27 additions and 18 deletions

View File

@ -23,25 +23,27 @@ namespace cldnn {
// Process-wide debug knobs for the GPU (cldnn) plugin, populated once from
// OV_GPU_* environment variables. Obtain the singleton via get_instance();
// fields are set during construction and treated as read-only afterwards.
//
// NOTE(review): the diff rendering had duplicated the member list (pre- and
// post-change copies); only the post-change set is kept here so the class
// has each member declared exactly once.
class debug_configuration {
private:
    debug_configuration();                      // Singleton: constructed by get_instance()
public:
    static const char *prefix;                  // Log-message prefix used when printing debug output
    int help;                                   // Print help messages
    int verbose;                                // Verbose execution
    int print_multi_kernel_perf;                // Print execution time of each kernel in multi-kernel primitive
    int disable_usm;                            // Disable usm usage
    int disable_onednn;                         // Disable onednn for discrete GPU (no effect for integrated GPU)
    std::string dump_graphs;                    // Dump optimized graph
    std::string dump_sources;                   // Dump opencl sources
    std::string dump_layers_path;               // Enable dumping intermediate buffers and set the dest path
    std::vector<std::string> dump_layers;       // Dump intermediate buffers of specified layers only
    std::string dry_run_path;                   // Dry run and serialize execution graph into the specified path
    int dump_layers_dst_only;                   // Dump only output of layers
    int dump_layers_result;                     // Dump result layers
    int dump_layers_limit_batch;                // Limit the size of batch to dump
    int base_batch_for_memory_estimation;       // Base batch size to be used in memory estimation
    std::vector<std::string> after_proc;        // Start inference after the listed processes
    int serialize_compile;                      // Serialize creating primitives and compiling kernels
    std::string forced_impl_type;               // Force implementation type either ocl or onednn
    int max_kernels_per_batch;                  // Maximum number of kernels in a batch during compiling kernels
    static const debug_configuration *get_instance();
    bool is_dumped_layer(const std::string& layerName, bool is_output = false) const;
};

View File

@ -120,6 +120,7 @@ static void print_help_messages() {
message_list.emplace_back("OV_GPU_SerialCompile", "Serialize creating primitives and compiling kernels");
message_list.emplace_back("OV_GPU_ForceImplType", "Force implementation type of a target primitive or layer. [primitive or layout_name]:[impl_type]"
"For primitives, fc:onednn, fc:ocl, do:cpu, do:ocl, reduce:ocl and reduce:onednn are supported");
message_list.emplace_back("OV_GPU_MaxKernelsPerBatch", "Maximum number of kernels in a batch during compiling kernels");
auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(),
[](std::pair<std::string, std::string>& a, std::pair<std::string, std::string>& b){
@ -150,7 +151,8 @@ debug_configuration::debug_configuration()
, dump_layers_limit_batch(std::numeric_limits<int>::max())
, base_batch_for_memory_estimation(-1)
, serialize_compile(0)
, forced_impl_type(std::string()) {
, forced_impl_type(std::string())
, max_kernels_per_batch(0) {
#ifdef GPU_DEBUG_CONFIG
get_gpu_debug_env_var("Help", help);
get_common_debug_env_var("Verbose", verbose);
@ -171,6 +173,7 @@ debug_configuration::debug_configuration()
get_gpu_debug_env_var("AfterProc", after_proc_str);
get_gpu_debug_env_var("SerialCompile", serialize_compile);
get_gpu_debug_env_var("ForceImplType", forced_impl_type);
get_gpu_debug_env_var("MaxKernelsPerBatch", max_kernels_per_batch);
if (help > 0) {
print_help_messages();

View File

@ -148,6 +148,10 @@ bool kernels_cache::is_cache_enabled() const {
}
// Returns the maximum number of kernels grouped into one compilation batch.
// Defaults to 8; can be overridden at runtime through the debug configuration
// (OV_GPU_MaxKernelsPerBatch) when the override value is >= 1.
size_t kernels_cache::get_max_kernels_per_batch() const {
// NOTE(review): GPU_DEBUG_GET_INSTANCE / GPU_DEBUG_IF are project macros;
// presumably they compile to no-ops when GPU_DEBUG_CONFIG is undefined —
// confirm against the debug_configuration header.
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->max_kernels_per_batch >= 1) {
return static_cast<size_t>(debug_config->max_kernels_per_batch); // user override (>= 1) wins
}
return 8; // built-in default batch size
}