[GPU] Debug config improvement (#6575)

2021-07-09 18:23:16 +09:00 · 2021-07-09 18:23:16 +09:00 · 0d775269ea
commit 0d775269ea
parent 98148539b3
6 changed files with 117 additions and 42 deletions
--- a/inference-engine/thirdparty/clDNN/api/cldnn/runtime/debug_configuration.hpp
+++ b/inference-engine/thirdparty/clDNN/api/cldnn/runtime/debug_configuration.hpp
@ -24,8 +24,13 @@ private:
    debug_configuration();
 public:
    static const char *prefix;
-    int verbose;
-    std::string dump_graphs;
+    int verbose;                    // Verbose execution
+    int print_multi_kernel_perf;    // Print execution time of each kernel in multi-kernel primitive
+    int disable_usm;                // Disable USM usage (also forced to 1 when dump_layers_path is set)
+    std::string dump_graphs;        // Dump optimized graph
+    std::string dump_layers_path;   // Enable dumping intermediate buffers and set the dest path (prepended to the file name as-is)
+    std::string dump_layers;        // Dump intermediate buffers of specified layers only, separated by space
+    int dump_layers_dst_only;       // Dump only output of layers (skip dumping their source buffers)
    static const debug_configuration *get_instance();
 };

--- a/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp
+++ b/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp
@ -35,10 +35,26 @@ static void get_str_env(const std::string &var, std::string &val) {

 debug_configuration::debug_configuration()  // Every option starts at 0 / empty; environment variables are read only when GPU_DEBUG_CONFIG is defined
        : verbose(0)
-        , dump_graphs(std::string()) {
+        , print_multi_kernel_perf(0)
+        , disable_usm(0)
+        , dump_graphs(std::string())
+        , dump_layers_path(std::string())
+        , dump_layers(std::string())
+        , dump_layers_dst_only(0) {
 #ifdef GPU_DEBUG_CONFIG
    get_int_env("OV_GPU_Verbose", verbose);
+    get_int_env("OV_GPU_PrintMultiKernelPerf", print_multi_kernel_perf);
+    get_int_env("OV_GPU_DisableUsm", disable_usm);
    get_str_env("OV_GPU_DumpGraphs", dump_graphs);
+    get_str_env("OV_GPU_DumpLayersPath", dump_layers_path);
+    get_str_env("OV_GPU_DumpLayers", dump_layers);
+    get_int_env("OV_GPU_DumpLayersDstOnly", dump_layers_dst_only);
+    if (dump_layers_path.length() > 0 && !disable_usm) {  // A non-empty dump path switches USM off and reports the override
+        disable_usm = 1;
+        GPU_DEBUG_COUT << "DisableUsm=1 because of DumpLayersPath" << std::endl;
+    }
+    if (dump_layers.length() > 0)
+        dump_layers = " " + dump_layers + " "; // Pad both ends with a space so users can search for " <layer name> " as a whole token
 #endif
 }

--- a/inference-engine/thirdparty/clDNN/runtime/engine.cpp
+++ b/inference-engine/thirdparty/clDNN/runtime/engine.cpp
@ -7,6 +7,7 @@
 #include "cldnn/runtime/memory.hpp"
 #include "cldnn/runtime/stream.hpp"
 #include "cldnn/runtime/device_query.hpp"
+#include "cldnn/runtime/debug_configuration.hpp"

 #include "ocl/ocl_engine_factory.hpp"

@ -32,6 +33,10 @@ const device::ptr engine::get_device() const {
 }

 bool engine::use_unified_shared_memory() const {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->disable_usm) {
+        return false;
+    }
    if (_device->get_mem_caps().supports_usm() && _configuration.use_unified_shared_memory) {
        return true;
    }
--- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_event.cpp
+++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_event.cpp
@ -3,6 +3,7 @@
 //

 #include "ocl_event.hpp"
+#include "cldnn/runtime/debug_configuration.hpp"

 #include <cassert>
 #include <iostream>
@ -175,6 +176,17 @@ bool ocl_events::get_profiling_info_impl(std::list<instrumentation::profiling_in
        for (auto& duration : all_durations[period.name]) {
            sum += (duration.second - duration.first);
        }
+
+        GPU_DEBUG_GET_INSTANCE(debug_config);
+        GPU_DEBUG_IF(debug_config->print_multi_kernel_perf) {
+            if (0 == strcmp(period.name, "executing")) {
+                GPU_DEBUG_COUT << "Multi-kernel time: ";
+                for (auto& duration : all_durations[period.name])
+                    std::cout << "  " << (duration.second - duration.first) / 1000;
+                std::cout << " Total " << sum / 1000 << std::endl;
+            }
+        }
+
        info.push_back(get_profiling_interval(period.name, 0, sum));
    }

--- a/inference-engine/thirdparty/clDNN/src/network.cpp
+++ b/inference-engine/thirdparty/clDNN/src/network.cpp
@ -33,15 +33,9 @@
 #include <utility>
 #include <map>

-// #define DEBUG_DUMP_PATH "cldnn_dump/"
-
-#ifdef DEBUG_DUMP_PATH
+#ifdef GPU_DEBUG_CONFIG
 #include <iomanip>
 #include <fstream>
-
-#define DUMP_VERBOSE 0
-#define DUMP_SINGLE_LAYER 0
-#define DUMP_LAYER_NAME ""
 #endif

 namespace cldnn {
@ -131,7 +125,7 @@ std::map<primitive_id, network_output> network::execute(const std::vector<event:
    return result;
 }

-#ifdef DEBUG_DUMP_PATH
+#ifdef GPU_DEBUG_CONFIG
 static float convert_half_to_float(half_t val, bool flush_denorm_to_zero = false) {
 #if defined HALF_HALF_HPP
    return val;
@ -180,6 +174,19 @@ float convert_element(float f) { return f; }

 float convert_element(half_t h) { return convert_half_to_float(h); }

+static size_t get_x_pitch(const layout& layout) {  // Returns the difference between the linear offsets of x=1 and x=0 for this layout, or 0 on failure
+    try {
+        auto tensor_x0 = tensor(batch(0), feature(0), spatial(0, 0, 0, 0));
+        auto tensor_x1 = tensor(batch(0), feature(0), spatial(1, 0, 0, 0));  // Same coordinate as tensor_x0, one step along x
+        auto x0 = layout.get_linear_offset(tensor_x0);
+        auto x1 = layout.get_linear_offset(tensor_x1);
+        return (x1 - x0);
+    } catch (...) {
+        // Any exception is swallowed and 0 is returned; the intended case is a spatial x size of 0, where x_pitch is meaningless
+        return 0;
+    }
+}
+
 template <class T>
 static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
    auto&& size = mem->get_layout().size;
@ -189,6 +196,8 @@ static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {

    mem_lock<T> lock(mem, stream);
    auto mem_ptr = lock.data();
+    auto x_pitch = get_x_pitch(mem->get_layout());
+    std::stringstream buffer;

    for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) {
        for (cldnn::tensor::value_type b = 0; b < size.batch[0]; ++b) {
@ -196,10 +205,11 @@ static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
                for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) {
                    for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) {
                        for (cldnn::tensor::value_type y = 0; y < size.spatial[1]; ++y) {
-                            for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x) {
-                                cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(x, y, z, w));
+                            cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(0, y, z, w));
                            size_t input_it = mem->get_layout().get_linear_offset(t);
-                                file_stream << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl;
+
+                            for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x, input_it += x_pitch) {
+                                buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl;
                            }
                        }
                    }
@ -207,6 +217,7 @@ static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
            }
        }
    }
+    file_stream << buffer.str();
 }
 template <>
 void dump<uint32_t>(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
@ -238,12 +249,13 @@ void dump<uint32_t>(memory::ptr mem, stream& stream, std::ofstream& file_stream)
 }

 static void log_memory_to_file(memory::ptr mem, stream& stream, std::string layerName) {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
    std::string filename = layerName;
    std::replace(filename.begin(), filename.end(), '\\', '_');
    std::replace(filename.begin(), filename.end(), '/', '_');
    std::replace(filename.begin(), filename.end(), ' ', '_');
    std::replace(filename.begin(), filename.end(), ':', '_');
-        filename = DEBUG_DUMP_PATH + filename + ".txt";
+        filename = debug_config->dump_layers_path + filename + ".txt";

    std::ofstream file_stream(filename);
    auto mem_dt = mem->get_layout().data_type;
@ -260,6 +272,12 @@ static void log_memory_to_file(memory::ptr mem, stream& stream, std::string laye
    else if (mem_dt == cldnn::data_types::u8)
        dump<uint8_t>(mem, stream, file_stream);
 }
+#else
+static void log_memory_to_file(memory::ptr mem, stream& stream, std::string layerName) {  // No-op stand-in compiled when GPU_DEBUG_CONFIG is not defined
+    (void)mem;  // The void casts mark each parameter as intentionally unused
+    (void)stream;
+    (void)layerName;
+}
 #endif
 /*
 Network_impl will always have net_id = 0 when it will be cldnn internal micronetwork (created i.e by propagate_constants
@ -487,25 +505,24 @@ void network_impl::execute(const std::vector<event::ptr>& events) {
    set_arguments();

    for (auto& inst : _exec_order) {
-#ifdef DEBUG_DUMP_PATH
+        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
            auto& node = _program->get_node(inst->id());
-
            std::string layer_name = node.id();
-#if DUMP_VERBOSE
+            GPU_DEBUG_IF(debug_config->verbose >= 2) {
                std::cerr << get_primitive_info(inst->id()) << std::endl;
-#endif
-#if DUMP_SINGLE_LAYER
-        if (layer_name == DUMP_LAYER_NAME) {
-#endif
-            std::cerr << "Dump " << layer_name << " layer" << std::endl;
+            }
+
+            GPU_DEBUG_IF(debug_config->dump_layers_dst_only == 0 &&
+                            (debug_config->dump_layers.length() == 0 ||
+                            (debug_config->dump_layers.length() != 0 && debug_config->dump_layers.find(" " + layer_name + " ") != std::string::npos))) {
+                std::cout << "Dump " << layer_name << " layer src" << std::endl;
                for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) {
                    log_memory_to_file(get_primitive(inst->id())->dep_memory_ptr(i), get_stream(),
                                    layer_name + "_src_" + std::to_string(i));
                }
-#if DUMP_SINGLE_LAYER
            }
-#endif
-#endif
+        }
+
        GPU_DEBUG_IF(debug_config->verbose >= 1) {
            GPU_DEBUG_COUT << "Execute " << inst->id() << std::endl;
        }
@ -517,16 +534,16 @@ void network_impl::execute(const std::vector<event::ptr>& events) {
        }
        execute_primitive(inst, events);

-#ifdef DEBUG_DUMP_PATH
+        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
            get_stream().finish();
-#if DUMP_SINGLE_LAYER
-        if (layer_name == DUMP_LAYER_NAME)
-#endif
-        {
+            auto& node = _program->get_node(inst->id());
+            std::string layer_name = node.id();
+            GPU_DEBUG_IF(debug_config->dump_layers.length() == 0 ||
+                        (debug_config->dump_layers.length() != 0 && debug_config->dump_layers.find(" " + layer_name + " ") != std::string::npos)) {
+                std::cout << "Dump " << layer_name << " layer dst" << std::endl;
                log_memory_to_file(get_primitive(inst->id())->output_memory_ptr(), get_stream(), layer_name + "_dst_0");
            }
-
-#endif
+        }
    }

    for (auto& inst : _program->get_processing_order()) {
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/debug_config_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/debug_config_gpu_test.cpp
@ -0,0 +1,20 @@
+// Copyright (C) 2018-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "test_utils/test_utils.h"
+#include "cldnn/runtime/debug_configuration.hpp"
+
+using namespace cldnn;
+using namespace ::tests;
+
+TEST(debug_config_test, check_debug_config_off_on_release) {  // Only checks anything when NDEBUG is defined; otherwise the test body is empty
+#ifdef NDEBUG
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(1) {  // NOTE(review): presumably a never-taken branch when the debug config is compiled out - macro definition not visible here, confirm
+        GTEST_FAIL();   /* This should be disabled in case of release build */
+    }
+#endif
+}