From 730c3f8f2545738f817fa4feda1a844a6cc582be Mon Sep 17 00:00:00 2001
From: "Min, Byungil" <byungil.min@intel.com>
Date: Wed, 29 Jun 2022 17:41:13 +0900
Subject: [PATCH] [GPU] Update Debug config for GPU plugin (#11983)

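+ Added OV_GPU_DumpLayersResult to dump the output buffers of result layers only
+ Applied minor updates: print the forced implementation type via GPU_DEBUG_COUT
  and refresh the OV_GPU_ForceImplType help message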

Signed-off-by: Min, Byungil <byungil.min@intel.com>
---
 .../intel_gpu/runtime/debug_configuration.hpp  |  3 ++-
 .../intel_gpu/src/graph/layout_optimizer.cpp   |  3 ++-
 src/plugins/intel_gpu/src/graph/network.cpp    |  2 +-
 .../src/runtime/debug_configuration.cpp        | 18 +++++++++++++-----
 4 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
index d921c9646bf..0dab5038c38 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
@@ -36,13 +36,14 @@ public:
     std::vector<std::string> dump_layers;        // Dump intermediate buffers of specified layers only
     std::string dry_run_path;       // Dry run and serialize execution graph into the specified path
     int dump_layers_dst_only;       // Dump only output of layers
+    int dump_layers_result;         // Dump output buffers of result layers only
     int dump_layers_limit_batch;    // Limit the size of batch to dump
     int base_batch_for_memory_estimation; // Base batch size to be used in memory estimation
     std::vector<std::string> after_proc; // Start inference after the listed processes
     int serialize_compile;          // Serialize creating primitives and compiling kernels
     std::string forced_impl_type; // Force implementation type either ocl or onednn
     static const debug_configuration *get_instance();
-    bool is_dumped_layer(const std::string& layerName) const;
+    bool is_dumped_layer(const std::string& layerName, bool is_output = false) const;
 };
 
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
index e8f7b1f0365..3b9132b5a1e 100644
--- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
+++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -1405,7 +1405,8 @@ impl_types layout_optimizer::get_forced_impl_type_by_config(program_node& node)
                 preferred_type = impl_types::cpu;
 
             if (node.id() == forced_impl_type.substr(0, found_type)) {
-                std::cout << "  >>> " << forced_impl_type.substr(0, found_type) << " : " << forced_impl_type.substr(found_type + 1) << std::endl;
+                GPU_DEBUG_COUT << " Forced implementation type : " << forced_impl_type.substr(0, found_type) << " : "
+                    << forced_impl_type.substr(found_type + 1) << std::endl;
                 return preferred_type;
             }
         }
diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp
index 23038ab507b..ba873f180e9 100644
--- a/src/plugins/intel_gpu/src/graph/network.cpp
+++ b/src/plugins/intel_gpu/src/graph/network.cpp
@@ -694,7 +694,7 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
             get_stream().finish();
             auto& node = _program->get_node(inst->id());
             const std::string layer_name = node.id();
-            GPU_DEBUG_IF(debug_config->is_dumped_layer(layer_name)) {
+            GPU_DEBUG_IF(debug_config->is_dumped_layer(layer_name, node.is_output())) {
                 log_memory_to_file(get_primitive(inst->id())->output_memory_ptr(), get_stream(), layer_name + "_dst_0");
             }
         }
diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
index 96e13957a1f..43c4adb1369 100644
--- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
+++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@@ -110,6 +110,7 @@ static void print_help_messages() {
     message_list.emplace_back("OV_GPU_DumpSources", "Dump opencl sources");
     message_list.emplace_back("OV_GPU_DumpLayersPath", "Enable dumping intermediate buffers and set the dest path");
     message_list.emplace_back("OV_GPU_DumpLayers", "Dump intermediate buffers of specified layers only, separated by space");
+    message_list.emplace_back("OV_GPU_DumpLayersResult", "Dump output buffers of result layers only");
     message_list.emplace_back("OV_GPU_DumpLayersDstOnly", "Dump only output of layers");
     message_list.emplace_back("OV_GPU_DumpLayersLimitBatch", "Limit the size of batch to dump");
     message_list.emplace_back("OV_GPU_DryRunPath", "Dry run and serialize execution graph into the specified path");
@@ -117,8 +118,8 @@ static void print_help_messages() {
     message_list.emplace_back("OV_GPU_AfterProc", "Run inference after the specified process PIDs are finished, separated by space."
                               " Supported on only on linux.");
     message_list.emplace_back("OV_GPU_SerialCompile", "Serialize creating primitives and compiling kernels");
-    message_list.emplace_back("OV_GPU_ForceImplType", "Force implementation type, either ocl or onednn, of a target primitive. [primitive]:[impl_type]"
-                              " Currently, only fc:onednn and fc:cldnn are supported.");
+    message_list.emplace_back("OV_GPU_ForceImplType", "Force implementation type of a target primitive or layer. [primitive or layout_name]:[impl_type]"
+                              " For primitives, fc:onednn, fc:ocl, do:cpu, do:ocl, reduce:ocl and reduce:onednn are supported");
 
     auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(),
         [](std::pair<std::string, std::string>& a, std::pair<std::string, std::string>& b){
@@ -143,6 +144,7 @@ debug_configuration::debug_configuration()
         , dump_sources(std::string())
         , dump_layers_path(std::string())
         , dump_layers_dst_only(0)
+        , dump_layers_result(0)
         , dry_run_path(std::string())
         , disable_onednn(0)
         , dump_layers_limit_batch(std::numeric_limits<int>::max())
@@ -157,8 +159,9 @@ debug_configuration::debug_configuration()
     get_gpu_debug_env_var("DumpGraphs", dump_graphs);
     get_gpu_debug_env_var("DumpSources", dump_sources);
     get_gpu_debug_env_var("DumpLayersPath", dump_layers_path);
-    get_gpu_debug_env_var("DumpLayersDstOnly", dump_layers_dst_only);
     get_gpu_debug_env_var("DumpLayersLimitBatch", dump_layers_limit_batch);
+    get_gpu_debug_env_var("DumpLayersDstOnly", dump_layers_dst_only);
+    get_gpu_debug_env_var("DumpLayersResult", dump_layers_result);
     get_gpu_debug_env_var("DisableOnednn", disable_onednn);
     get_gpu_debug_env_var("DryRunPath", dry_run_path);
     get_gpu_debug_env_var("BaseBatchForMemEstimation", base_batch_for_memory_estimation);
@@ -211,9 +214,14 @@ const debug_configuration *debug_configuration::get_instance() {
 #endif
 }
 
-bool debug_configuration::is_dumped_layer(const std::string& layerName) const {
+bool debug_configuration::is_dumped_layer(const std::string& layerName, bool is_output) const {
 #ifdef GPU_DEBUG_CONFIG
-    if (dump_layers.empty()) return true;
+    if (is_output == true && dump_layers_result == 1 &&
+        (layerName.find("constant:") == std::string::npos))
+        return true;
+    if (dump_layers.empty() && dump_layers_result == 0)
+        return true;
+
     auto iter = std::find_if(dump_layers.begin(), dump_layers.end(), [&](const std::string& dl){
         return (layerName.compare(dl) == 0);
     });