From 0d775269eaa2cbdbf9bdbc19477bc62d99fdfa7d Mon Sep 17 00:00:00 2001
From: Mingyu Kim
Date: Fri, 9 Jul 2021 18:23:16 +0900
Subject: [PATCH] [GPU] Debug config improvement (#6575)

---
 .../api/cldnn/runtime/debug_configuration.hpp |  9 +-
 .../clDNN/runtime/debug_configuration.cpp     | 18 +++-
 .../thirdparty/clDNN/runtime/engine.cpp       |  5 +
 .../clDNN/runtime/ocl/ocl_event.cpp           | 12 +++
 .../thirdparty/clDNN/src/network.cpp          | 95 +++++++++++--------
 .../test_cases/debug_config_gpu_test.cpp      | 20 ++++
 6 files changed, 117 insertions(+), 42 deletions(-)
 create mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/debug_config_gpu_test.cpp

diff --git a/inference-engine/thirdparty/clDNN/api/cldnn/runtime/debug_configuration.hpp b/inference-engine/thirdparty/clDNN/api/cldnn/runtime/debug_configuration.hpp
index 469993a4691..c16eb97862f 100644
--- a/inference-engine/thirdparty/clDNN/api/cldnn/runtime/debug_configuration.hpp
+++ b/inference-engine/thirdparty/clDNN/api/cldnn/runtime/debug_configuration.hpp
@@ -24,8 +24,13 @@ private:
     debug_configuration();
 public:
     static const char *prefix;
-    int verbose;
-    std::string dump_graphs;
+    int verbose;                  // Verbose execution
+    int print_multi_kernel_perf;  // Print execution time of each kernel in a multi-kernel primitive
+    int disable_usm;              // Disable USM usage
+    std::string dump_graphs;      // Dump optimized graph
+    std::string dump_layers_path; // Enable dumping intermediate buffers and set the destination path
+    std::string dump_layers;      // Dump intermediate buffers of the specified layers only, separated by spaces
+    int dump_layers_dst_only;     // Dump only the output of layers
     static const debug_configuration *get_instance();
 };
 
diff --git a/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp b/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp
index cc7d64f71ec..4f07f5e1f09 100644
--- a/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp
+++ b/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp
@@ -35,10 +35,26 @@ static void get_str_env(const std::string &var, std::string &val) {
 
 debug_configuration::debug_configuration()
         : verbose(0)
-        , dump_graphs(std::string()) {
+        , print_multi_kernel_perf(0)
+        , disable_usm(0)
+        , dump_graphs(std::string())
+        , dump_layers_path(std::string())
+        , dump_layers(std::string())
+        , dump_layers_dst_only(0) {
 #ifdef GPU_DEBUG_CONFIG
     get_int_env("OV_GPU_Verbose", verbose);
+    get_int_env("OV_GPU_PrintMultiKernelPerf", print_multi_kernel_perf);
+    get_int_env("OV_GPU_DisableUsm", disable_usm);
     get_str_env("OV_GPU_DumpGraphs", dump_graphs);
+    get_str_env("OV_GPU_DumpLayersPath", dump_layers_path);
+    get_str_env("OV_GPU_DumpLayers", dump_layers);
+    get_int_env("OV_GPU_DumpLayersDstOnly", dump_layers_dst_only);
+    if (dump_layers_path.length() > 0 && !disable_usm) {
+        disable_usm = 1;
+        GPU_DEBUG_COUT << "DisableUsm=1 because of DumpLayersPath" << std::endl;
+    }
+    if (dump_layers.length() > 0)
+        dump_layers = " " + dump_layers + " "; // Insert delimiters for easier parsing when used
 #endif
 }
 
diff --git a/inference-engine/thirdparty/clDNN/runtime/engine.cpp b/inference-engine/thirdparty/clDNN/runtime/engine.cpp
index 561f38f00f4..d1c81a1fbd3 100644
--- a/inference-engine/thirdparty/clDNN/runtime/engine.cpp
+++ b/inference-engine/thirdparty/clDNN/runtime/engine.cpp
@@ -7,6 +7,7 @@
 #include "cldnn/runtime/memory.hpp"
 #include "cldnn/runtime/stream.hpp"
 #include "cldnn/runtime/device_query.hpp"
+#include "cldnn/runtime/debug_configuration.hpp"
 
 #include "ocl/ocl_engine_factory.hpp"
 
@@ -32,6 +33,10 @@ const device::ptr engine::get_device() const {
 }
 
 bool engine::use_unified_shared_memory() const {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->disable_usm) {
+        return false;
+    }
     if (_device->get_mem_caps().supports_usm() && _configuration.use_unified_shared_memory) {
         return true;
     }
diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_event.cpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_event.cpp
index 76e3c8f94ff..d40dbf7dfa1 100644
--- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_event.cpp
+++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_event.cpp
@@ -3,6 +3,7 @@
 //
 
 #include "ocl_event.hpp"
+#include "cldnn/runtime/debug_configuration.hpp"
 
 #include
 #include
@@ -175,6 +176,17 @@ bool ocl_events::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
             sum += (duration.second - duration.first);
         }
 
+        GPU_DEBUG_GET_INSTANCE(debug_config);
+        GPU_DEBUG_IF(debug_config->print_multi_kernel_perf) {
+            if (0 == strcmp(period.name, "executing")) {
+                GPU_DEBUG_COUT << "Multi-kernel time: ";
+                for (auto& duration : all_durations[period.name])
+                    std::cout << " " << (duration.second - duration.first) / 1000;
+                std::cout << " Total " << sum / 1000 << std::endl;
+            }
+        }
+
         info.push_back(get_profiling_interval(period.name, 0, sum));
     }
diff --git a/inference-engine/thirdparty/clDNN/src/network.cpp b/inference-engine/thirdparty/clDNN/src/network.cpp
index 0677ea5fbbe..a5ab53da745 100644
--- a/inference-engine/thirdparty/clDNN/src/network.cpp
+++ b/inference-engine/thirdparty/clDNN/src/network.cpp
@@ -33,15 +33,9 @@
 #include
 #include
 
-// #define DEBUG_DUMP_PATH "cldnn_dump/"
-
-#ifdef DEBUG_DUMP_PATH
+#ifdef GPU_DEBUG_CONFIG
 #include <iomanip>
 #include <fstream>
-
-#define DUMP_VERBOSE 0
-#define DUMP_SINGLE_LAYER 0
-#define DUMP_LAYER_NAME ""
 #endif
 
 namespace cldnn {
@@ -131,7 +125,7 @@ std::map<primitive_id, network_output> network::execute(const std::vector<event::ptr>& dependencies) {
     return result;
 }
 
-#ifdef DEBUG_DUMP_PATH
+#ifdef GPU_DEBUG_CONFIG
 static float convert_element(int32_t i) { return static_cast<float>(i); }
 
 static float convert_element(float f) { return f; }
@@ -176,6 +170,19 @@ static float convert_element(float f) { return f; }
 
 static float convert_element(half_t h) { return half_to_float(h); }
 
+static size_t get_x_pitch(const layout& layout) {
+    try {
+        auto tensor_x0 = tensor(batch(0), feature(0), spatial(0, 0, 0, 0));
+        auto tensor_x1 = tensor(batch(0), feature(0), spatial(1, 0, 0, 0));
+        auto x0 = layout.get_linear_offset(tensor_x0);
+        auto x1 = layout.get_linear_offset(tensor_x1);
+        return (x1 - x0);
+    } catch (...) {
+        return 1;
+    }
+}
+
 template <class T>
 static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
     auto&& size = mem->get_layout().size;
@@ -189,6 +196,8 @@ static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
     mem_lock<T> lock(mem, stream);
     auto mem_ptr = lock.data();
 
+    auto x_pitch = get_x_pitch(mem->get_layout());
+    std::stringstream buffer;
     for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) {
         for (cldnn::tensor::value_type b = 0; b < size.batch[0]; ++b) {
@@ -196,10 +205,11 @@ static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
             for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) {
                 for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) {
                     for (cldnn::tensor::value_type y = 0; y < size.spatial[1]; ++y) {
-                        for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x) {
-                            cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(x, y, z, w));
-                            size_t input_it = mem->get_layout().get_linear_offset(t);
-                            file_stream << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl;
+                        cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(0, y, z, w));
+                        size_t input_it = mem->get_layout().get_linear_offset(t);
+
+                        for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x, input_it += x_pitch) {
+                            buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl;
                         }
                     }
                 }
@@ -207,6 +217,7 @@ static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
             }
         }
     }
+    file_stream << buffer.str();
 }
 template <>
 void dump<uint32_t>(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
@@ -238,12 +249,13 @@ void dump<uint32_t>(memory::ptr mem, stream& stream, std::ofstream& file_stream
 }
 
 static void log_memory_to_file(memory::ptr mem, stream& stream, std::string layerName) {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
     std::string filename = layerName;
     std::replace(filename.begin(), filename.end(), '\\', '_');
     std::replace(filename.begin(), filename.end(), '/', '_');
     std::replace(filename.begin(), filename.end(), ' ', '_');
     std::replace(filename.begin(), filename.end(), ':', '_');
-    filename = DEBUG_DUMP_PATH + filename + ".txt";
+    filename = debug_config->dump_layers_path + filename + ".txt";
 
     std::ofstream file_stream(filename);
     auto mem_dt = mem->get_layout().data_type;
@@ -260,6 +272,12 @@ static void log_memory_to_file(memory::ptr mem, stream& stream, std::string laye
     else if (mem_dt == cldnn::data_types::u8)
         dump<uint8_t>(mem, stream, file_stream);
 }
+#else
+static void log_memory_to_file(memory::ptr mem, stream& stream, std::string layerName) {
+    (void)mem;
+    (void)stream;
+    (void)layerName;
+}
 #endif
 
 /* Network_impl will always have net_id = 0 when it will be cldnn internal micronetwork (created i.e by propagate_constants
@@ -487,25 +505,24 @@ void network_impl::execute(const std::vector<event::ptr>& events) {
     set_arguments();
 
     for (auto& inst : _exec_order) {
-#ifdef DEBUG_DUMP_PATH
-        auto& node = _program->get_node(inst->id());
-
-        std::string layer_name = node.id();
-#if DUMP_VERBOSE
-        std::cerr << get_primitive_info(inst->id()) << std::endl;
-#endif
-#if DUMP_SINGLE_LAYER
-        if (layer_name == DUMP_LAYER_NAME) {
-#endif
-            std::cerr << "Dump " << layer_name << " layer" << std::endl;
-            for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) {
-                log_memory_to_file(get_primitive(inst->id())->dep_memory_ptr(i), get_stream(),
-                                   layer_name + "_src_" + std::to_string(i));
+        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
+            auto& node = _program->get_node(inst->id());
+            std::string layer_name = node.id();
+            GPU_DEBUG_IF(debug_config->verbose >= 2) {
+                std::cerr << get_primitive_info(inst->id()) << std::endl;
+            }
+
+            GPU_DEBUG_IF(debug_config->dump_layers_dst_only == 0 &&
+                         (debug_config->dump_layers.length() == 0 ||
+                          (debug_config->dump_layers.length() != 0 && debug_config->dump_layers.find(" " + layer_name + " ") != std::string::npos))) {
+                std::cout << "Dump " << layer_name << " layer src" << std::endl;
+                for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) {
+                    log_memory_to_file(get_primitive(inst->id())->dep_memory_ptr(i), get_stream(),
+                                       layer_name + "_src_" + std::to_string(i));
+                }
             }
         }
-#if DUMP_SINGLE_LAYER
-        }
-#endif
-#endif
+
         GPU_DEBUG_IF(debug_config->verbose >= 1) {
             GPU_DEBUG_COUT << "Execute " << inst->id() << std::endl;
         }
@@ -517,16 +534,16 @@ void network_impl::execute(const std::vector<event::ptr>& events) {
         }
 
         execute_primitive(inst, events);
-#ifdef DEBUG_DUMP_PATH
-        get_stream().finish();
-#if DUMP_SINGLE_LAYER
-        if (layer_name == DUMP_LAYER_NAME)
-#endif
-        {
-            log_memory_to_file(get_primitive(inst->id())->output_memory_ptr(), get_stream(), layer_name + "_dst_0");
+        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
+            get_stream().finish();
+            auto& node = _program->get_node(inst->id());
+            std::string layer_name = node.id();
+            GPU_DEBUG_IF(debug_config->dump_layers.length() == 0 ||
+                         (debug_config->dump_layers.length() != 0 && debug_config->dump_layers.find(" " + layer_name + " ") != std::string::npos)) {
+                std::cout << "Dump " << layer_name << " layer dst" << std::endl;
+                log_memory_to_file(get_primitive(inst->id())->output_memory_ptr(), get_stream(), layer_name + "_dst_0");
+            }
         }
-
-#endif
     }
 
     for (auto& inst : _program->get_processing_order()) {
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/debug_config_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/debug_config_gpu_test.cpp
new file mode 100644
index 00000000000..4f9fe3d603d
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/debug_config_gpu_test.cpp
@@ -0,0 +1,20 @@
+// Copyright (C) 2018-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "test_utils/test_utils.h"
+#include "cldnn/runtime/debug_configuration.hpp"
+
+using namespace cldnn;
+using namespace ::tests;
+
+TEST(debug_config_test, check_debug_config_off_on_release) {
+#ifdef NDEBUG
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(1) {
+        GTEST_FAIL(); /* This should be disabled in case of release build */
+    }
+#endif
+}
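
Usage note (not part of the patch): the new knobs are plain environment variables read once when the debug_configuration singleton is first constructed, e.g. running an application with OV_GPU_Verbose=1, OV_GPU_PrintMultiKernelPerf=1, OV_GPU_DumpLayersPath=/tmp/cldnn_dump/ (note the trailing slash, since the path is prepended directly to the file name), and optionally OV_GPU_DumpLayers="conv1 relu1". A minimal sketch of how client code is expected to consult the new fields, mirroring the space-delimited matching used in network.cpp above; the helper should_dump_layer is illustrative only and does not exist in the patch:

    #include <string>
    #include "cldnn/runtime/debug_configuration.hpp"

    // Illustrative helper: OV_GPU_DumpLayers is stored with a leading and a
    // trailing space (see debug_configuration()), so an exact layer-name match
    // is a substring search for " <name> ".
    static bool should_dump_layer(const std::string& layer_name) {
        GPU_DEBUG_GET_INSTANCE(debug_config);  // whole block compiles away without GPU_DEBUG_CONFIG
        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
            // An empty OV_GPU_DumpLayers means "dump every layer".
            return debug_config->dump_layers.length() == 0 ||
                   debug_config->dump_layers.find(" " + layer_name + " ") != std::string::npos;
        }
        return false;  // dumping not enabled
    }

Keeping the delimiter handling in the constructor (rather than at every call site) is what lets the hot path reduce to a single std::string::find, and the release-build test above relies on GPU_DEBUG_IF collapsing to a never-taken branch.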