From a4518ae5953832d8b4cb239744f14138e3afbb6e Mon Sep 17 00:00:00 2001
From: Jade Cho
Date: Mon, 20 Dec 2021 14:19:53 +0900
Subject: [PATCH] [GPU] Add DumpLayersLimitBatch config to GPU debug utils. (#9196)

* Add DumpLayersLimitBatch config to GPU debug utils.

+ Support OV_GPU_Help config
+ Only run first inference if OV_GPU_DumpLayersPath is set.
+ Fix dump graph bug.

* Apply some comments

* Remove unnecessary code.
---
 .../intel_gpu/runtime/debug_configuration.hpp |  2 +
 .../clDNN/runtime/debug_configuration.cpp     | 41 ++++++++++++++++++-
 .../graph_optimizer/graph_initializations.cpp |  5 ---
 .../thirdparty/clDNN/src/network.cpp          | 16 ++++++--
 .../thirdparty/clDNN/src/program.cpp          |  7 +++-
 .../intel_gpu/src/plugin/infer_request.cpp    |  7 ++++
 6 files changed, 68 insertions(+), 10 deletions(-)

diff --git a/inference-engine/thirdparty/clDNN/api/intel_gpu/runtime/debug_configuration.hpp b/inference-engine/thirdparty/clDNN/api/intel_gpu/runtime/debug_configuration.hpp
index d96c151e5fe..600a322491b 100644
--- a/inference-engine/thirdparty/clDNN/api/intel_gpu/runtime/debug_configuration.hpp
+++ b/inference-engine/thirdparty/clDNN/api/intel_gpu/runtime/debug_configuration.hpp
@@ -24,6 +24,7 @@ private:
     debug_configuration();
 public:
     static const char *prefix;
+    int help; // Print help messages
     int verbose; // Verbose execution
     int print_multi_kernel_perf; // Print execution time of each kernel in multi-kernel primitimive
     int disable_usm; // Disable usm usage
@@ -34,6 +35,7 @@ public:
     std::string dump_layers; // Dump intermediate buffers of specified layers only, separated by space
     std::string dry_run_path; // Dry run and serialize execution graph into the specified path
     int dump_layers_dst_only; // Dump only output of layers
+    int dump_layers_limit_batch; // Limit the size of batch to dump
     int base_batch_for_memory_estimation; // Base batch size to be used in memory estimation
     static const debug_configuration *get_instance();
 };
diff --git a/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp b/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp
index 33ecf9625ba..2a387b39515 100644
--- a/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp
+++ b/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp
@@ -3,7 +3,9 @@
 //
 
 #include "intel_gpu/runtime/debug_configuration.hpp"
+#include <iomanip>
 #include <iostream>
+#include <limits>
 #include <memory>
 #include <ostream>
 #include <sstream>
@@ -97,10 +99,39 @@ void get_common_debug_env_var(const std::string &var, T &val) {
     return get_debug_env_var(var, val, allowed_option_prefixes);
 }
 
+static void print_help_messages() {
+    std::vector<std::pair<std::string, std::string>> message_list;
+    message_list.emplace_back("OV_GPU_Help", "Print help messages");
+    message_list.emplace_back("OV_GPU_Verbose", "Verbose execution");
+    message_list.emplace_back("OV_GPU_PrintMultiKernelPerf", "Print execution time of each kernel in multi-kernel primitimive");
+    message_list.emplace_back("OV_GPU_DisableUsm", "Disable usm usage");
+    message_list.emplace_back("OV_GPU_DisableOnednn", "Disable onednn for discrete GPU (no effect for integrated GPU)");
+    message_list.emplace_back("OV_GPU_DumpGraphs", "Dump optimized graph");
+    message_list.emplace_back("OV_GPU_DumpSources", "Dump opencl sources");
+    message_list.emplace_back("OV_GPU_DumpLayersPath", "Enable dumping intermediate buffers and set the dest path");
+    message_list.emplace_back("OV_GPU_DumpLayers", "Dump intermediate buffers of specified layers only, separated by space");
+    message_list.emplace_back("OV_GPU_DumpLayersDstOnly", "Dump only output of layers");
message_list.emplace_back("OV_GPU_DumpLayersLimitBatch", "Limit the size of batch to dump"); + message_list.emplace_back("OV_GPU_DryRunPath", "Dry run and serialize execution graph into the specified path"); + message_list.emplace_back("OV_GPU_BaseBatchForMemEstimation", "Base batch size to be used in memory estimation"); + + auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(), + [](std::pair& a, std::pair& b){ + return a.first.size() < b.first.size(); + }); + int name_width = static_cast(max_name_length_item->first.size()) + 2; + + GPU_DEBUG_COUT << "Supported environment variables for debugging" << std::endl; + for (auto& p : message_list) { + GPU_DEBUG_COUT << " - " << std::left << std::setw(name_width) << p.first + ": " << p.second << std::endl; + } +} + #endif debug_configuration::debug_configuration() - : verbose(0) + : help(0) + , verbose(0) , print_multi_kernel_perf(0) , disable_usm(0) , dump_graphs(std::string()) @@ -110,8 +141,10 @@ debug_configuration::debug_configuration() , dump_layers_dst_only(0) , dry_run_path(std::string()) , disable_onednn(0) + , dump_layers_limit_batch(std::numeric_limits::max()) , base_batch_for_memory_estimation(-1) { #ifdef GPU_DEBUG_CONFIG + get_gpu_debug_env_var("Help", help); get_common_debug_env_var("Verbose", verbose); get_gpu_debug_env_var("PrintMultiKernelPerf", print_multi_kernel_perf); get_gpu_debug_env_var("DisableUsm", disable_usm); @@ -120,10 +153,16 @@ debug_configuration::debug_configuration() get_gpu_debug_env_var("DumpLayersPath", dump_layers_path); get_gpu_debug_env_var("DumpLayers", dump_layers); get_gpu_debug_env_var("DumpLayersDstOnly", dump_layers_dst_only); + get_gpu_debug_env_var("DumpLayersLimitBatch", dump_layers_limit_batch); get_gpu_debug_env_var("DisableOnednn", disable_onednn); get_gpu_debug_env_var("DryRunPath", dry_run_path); get_gpu_debug_env_var("BaseBatchForMemEstimation", base_batch_for_memory_estimation); + if (help > 0) { + print_help_messages(); + exit(0); + } + if (dump_layers.length() > 0) dump_layers = " " + dump_layers + " "; // Insert delimiter for easier parsing when used #endif diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/graph_initializations.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/graph_initializations.cpp index f579819ac9f..0a1ba4156b7 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/graph_initializations.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/graph_initializations.cpp @@ -436,10 +436,5 @@ void graph_initializations::run(program& p) { } set_outputs(p); p.get_processing_order().calc_processing_order(p); - - for (auto& node : p.get_processing_order()) { - if (!node->is_type()) - node->get_output_layout(); - } } } // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/src/network.cpp b/inference-engine/thirdparty/clDNN/src/network.cpp index febc8d61bce..29210058d6b 100644 --- a/inference-engine/thirdparty/clDNN/src/network.cpp +++ b/inference-engine/thirdparty/clDNN/src/network.cpp @@ -110,8 +110,18 @@ template static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) { auto&& size = mem->get_layout().size; - file_stream << "shape: " << size.to_string() << " "; - file_stream << "(count: " << size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" << std::endl; + GPU_DEBUG_GET_INSTANCE(debug_config); + auto batch_size = std::max(std::min(debug_config->dump_layers_limit_batch, size.batch[0]), 1); + tensor tmp_size(size); + 
+    tmp_size.batch[0] = batch_size;
+    if (tmp_size == size) {
+        file_stream << "shape: " << size.to_string() << " ";
+        file_stream << "(count: " << size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" << std::endl;
+    } else {
+        file_stream << "shape: " << tmp_size.to_string() << " ";
+        file_stream << "(count: " << tmp_size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format)
+                    << ", original shape: " << size.to_string() << ")" << std::endl;
+    }
 
     mem_lock<T> lock(mem, stream);
     auto mem_ptr = lock.data();
@@ -119,7 +129,7 @@ static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
     std::stringstream buffer;
 
     for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) {
-        for (cldnn::tensor::value_type b = 0; b < size.batch[0]; ++b) {
+        for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) {
             for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) {
                 for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) {
                     for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) {
diff --git a/inference-engine/thirdparty/clDNN/src/program.cpp b/inference-engine/thirdparty/clDNN/src/program.cpp
index ebdc3f3920e..d4cb50f41c5 100644
--- a/inference-engine/thirdparty/clDNN/src/program.cpp
+++ b/inference-engine/thirdparty/clDNN/src/program.cpp
@@ -1225,8 +1225,13 @@ program::primitives_info program::get_current_stage_info() const {
 
 void program::save_pass_info(std::string pass_name) {
     // TODO: Directory path here can be probably changed to some bool flag
-    if (!options.get<build_option_type::graph_dumps_dir>()->directory_path.empty())
+    if (!options.get<build_option_type::graph_dumps_dir>()->directory_path.empty()) {
+        for (auto& node : this->get_processing_order()) {
+            if (!node->is_type<data>())
+                node->get_output_layout();
+        }
         optimizer_passes_info.emplace_back(pass_name, get_current_stage_info());
+    }
 }
 
 void program::add_optimized_primitive_info(primitive_id optimized_primitive_id,
diff --git a/src/plugins/intel_gpu/src/plugin/infer_request.cpp b/src/plugins/intel_gpu/src/plugin/infer_request.cpp
index 4801fb9a93f..2fc215403c3 100644
--- a/src/plugins/intel_gpu/src/plugin/infer_request.cpp
+++ b/src/plugins/intel_gpu/src/plugin/infer_request.cpp
@@ -557,6 +557,13 @@ void InferRequest::enqueue() {
 
     internal_outputs.clear();
     internal_outputs = m_graph->GetNetwork()->execute(dependencies);
+
+    // If dump layers path is set, only runs first inference.
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
+        GPU_DEBUG_COUT << "Only run first inference to dump layers." << std::endl;
+        exit(0);
+    }
 }
 
 void InferRequest::wait_notify() {
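Note on the batch-limiting rule used in the network.cpp dump path above: the dumped batch dimension is clamped to max(min(dump_layers_limit_batch, actual_batch), 1), and dump_layers_limit_batch defaults to std::numeric_limits<int>::max(), so dumps stay unlimited unless OV_GPU_DumpLayersLimitBatch is set. The sketch below is illustrative only and not part of the patch; clamp_batch() is a hypothetical standalone helper that mirrors the std::max/std::min expression inlined in dump().

// Illustrative sketch, not part of the patch: demonstrates the clamp applied to the
// batch dimension before dumping intermediate buffers.
#include <algorithm>
#include <iostream>
#include <limits>

// Mirrors the expression in dump(): at least 1 batch element, at most the configured limit.
static int clamp_batch(int limit, int actual_batch) {
    return std::max(std::min(limit, actual_batch), 1);
}

int main() {
    const int unlimited = std::numeric_limits<int>::max();          // default limit
    std::cout << clamp_batch(unlimited, 32) << std::endl;           // 32: full batch is dumped
    std::cout << clamp_batch(4, 32) << std::endl;                   // 4: e.g. OV_GPU_DumpLayersLimitBatch=4
    std::cout << clamp_batch(0, 32) << std::endl;                   // 1: never dumps fewer than one element
    return 0;
}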