From 0d775269eaa2cbdbf9bdbc19477bc62d99fdfa7d Mon Sep 17 00:00:00 2001
From: Mingyu Kim
Date: Fri, 9 Jul 2021 18:23:16 +0900
Subject: [PATCH] [GPU] Debug config improvement (#6575)

---
 .../api/cldnn/runtime/debug_configuration.hpp |  9 +-
 .../clDNN/runtime/debug_configuration.cpp     | 18 +++-
 .../thirdparty/clDNN/runtime/engine.cpp       |  5 +
 .../clDNN/runtime/ocl/ocl_event.cpp           | 12 +++
 .../thirdparty/clDNN/src/network.cpp          | 95 +++++++++++--------
 .../test_cases/debug_config_gpu_test.cpp      | 20 ++++
 6 files changed, 117 insertions(+), 42 deletions(-)
 create mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/debug_config_gpu_test.cpp

diff --git a/inference-engine/thirdparty/clDNN/api/cldnn/runtime/debug_configuration.hpp b/inference-engine/thirdparty/clDNN/api/cldnn/runtime/debug_configuration.hpp
index 469993a4691..c16eb97862f 100644
--- a/inference-engine/thirdparty/clDNN/api/cldnn/runtime/debug_configuration.hpp
+++ b/inference-engine/thirdparty/clDNN/api/cldnn/runtime/debug_configuration.hpp
@@ -24,8 +24,13 @@ private:
     debug_configuration();
 public:
     static const char *prefix;
-    int verbose;
-    std::string dump_graphs;
+    int verbose;                  // Verbose execution
+    int print_multi_kernel_perf;  // Print execution time of each kernel in a multi-kernel primitive
+    int disable_usm;              // Disable USM usage
+    std::string dump_graphs;      // Dump optimized graph
+    std::string dump_layers_path; // Enable dumping intermediate buffers and set the destination path
+    std::string dump_layers;      // Dump intermediate buffers of the specified layers only, separated by spaces
+    int dump_layers_dst_only;     // Dump only the output of layers
     static const debug_configuration *get_instance();
 };
 
diff --git a/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp b/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp
index cc7d64f71ec..4f07f5e1f09 100644
--- a/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp
+++ b/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp
@@ -35,10 +35,26 @@ static void get_str_env(const std::string &var, std::string &val) {
 
 debug_configuration::debug_configuration()
         : verbose(0)
-        , dump_graphs(std::string()) {
+        , print_multi_kernel_perf(0)
+        , disable_usm(0)
+        , dump_graphs(std::string())
+        , dump_layers_path(std::string())
+        , dump_layers(std::string())
+        , dump_layers_dst_only(0) {
 #ifdef GPU_DEBUG_CONFIG
     get_int_env("OV_GPU_Verbose", verbose);
+    get_int_env("OV_GPU_PrintMultiKernelPerf", print_multi_kernel_perf);
+    get_int_env("OV_GPU_DisableUsm", disable_usm);
     get_str_env("OV_GPU_DumpGraphs", dump_graphs);
+    get_str_env("OV_GPU_DumpLayersPath", dump_layers_path);
+    get_str_env("OV_GPU_DumpLayers", dump_layers);
+    get_int_env("OV_GPU_DumpLayersDstOnly", dump_layers_dst_only);
+    if (dump_layers_path.length() > 0 && !disable_usm) {
+        disable_usm = 1;
+        GPU_DEBUG_COUT << "DisableUsm=1 because of DumpLayersPath" << std::endl;
+    }
+    if (dump_layers.length() > 0)
+        dump_layers = " " + dump_layers + " "; // Insert delimiters for easier parsing when used
 #endif
 }
 
diff --git a/inference-engine/thirdparty/clDNN/runtime/engine.cpp b/inference-engine/thirdparty/clDNN/runtime/engine.cpp
index 561f38f00f4..d1c81a1fbd3 100644
--- a/inference-engine/thirdparty/clDNN/runtime/engine.cpp
+++ b/inference-engine/thirdparty/clDNN/runtime/engine.cpp
@@ -7,6 +7,7 @@
 #include "cldnn/runtime/memory.hpp"
 #include "cldnn/runtime/stream.hpp"
 #include "cldnn/runtime/device_query.hpp"
+#include "cldnn/runtime/debug_configuration.hpp"
 
 #include "ocl/ocl_engine_factory.hpp"
 
@@ -32,6 +33,10 @@ const device::ptr engine::get_device() const {
 }
 
 bool engine::use_unified_shared_memory() const {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->disable_usm) {
+        return false;
+    }
     if (_device->get_mem_caps().supports_usm() && _configuration.use_unified_shared_memory) {
         return true;
     }
diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_event.cpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_event.cpp
index 76e3c8f94ff..d40dbf7dfa1 100644
--- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_event.cpp
+++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_event.cpp
@@ -3,6 +3,7 @@
 //
 
 #include "ocl_event.hpp"
+#include "cldnn/runtime/debug_configuration.hpp"
 
 #include
 #include
@@ -175,6 +176,17 @@ bool ocl_events::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
             sum += (duration.second - duration.first);
         }
 
+        GPU_DEBUG_GET_INSTANCE(debug_config);
+        GPU_DEBUG_IF(debug_config->print_multi_kernel_perf) {
+            if (0 == strcmp(period.name, "executing")) {
+                GPU_DEBUG_COUT << "Multi-kernel time: ";
+                for (auto& duration : all_durations[period.name])
+                    std::cout << " " << (duration.second - duration.first) / 1000;
+                std::cout << " Total " << sum / 1000 << std::endl;
+            }
+        }
+
         info.push_back(get_profiling_interval(period.name, 0, sum));
     }
diff --git a/inference-engine/thirdparty/clDNN/src/network.cpp b/inference-engine/thirdparty/clDNN/src/network.cpp
index 0677ea5fbbe..a5ab53da745 100644
--- a/inference-engine/thirdparty/clDNN/src/network.cpp
+++ b/inference-engine/thirdparty/clDNN/src/network.cpp
@@ -33,15 +33,9 @@
 #include
 #include
 
-// #define DEBUG_DUMP_PATH "cldnn_dump/"
-
-#ifdef DEBUG_DUMP_PATH
+#ifdef GPU_DEBUG_CONFIG
 #include <iomanip>
 #include <fstream>
-
-#define DUMP_VERBOSE 0
-#define DUMP_SINGLE_LAYER 0
-#define DUMP_LAYER_NAME ""
 #endif
 
 namespace cldnn {
@@ -131,7 +125,7 @@ std::map<primitive_id, network_output> network::execute(const std::vector<event::ptr>& dependencies) {
     return result;
 }
 
-#ifdef DEBUG_DUMP_PATH
+#ifdef GPU_DEBUG_CONFIG
 static float convert_element(int32_t i) { return static_cast<float>(i); }
 
 static float convert_element(float f) { return f; }
@@ -176,6 +170,19 @@ static float convert_element(float f) { return f; }
 
 static float convert_element(half_t h) { return half_to_float(h); }
 
+static size_t get_x_pitch(const layout& layout) {
+    try {
+        auto tensor_x0 = tensor(batch(0), feature(0), spatial(0, 0, 0, 0));
+        auto tensor_x1 = tensor(batch(0), feature(0), spatial(1, 0, 0, 0));
+        auto x0 = layout.get_linear_offset(tensor_x0);
+        auto x1 = layout.get_linear_offset(tensor_x1);
+        return (x1 - x0);
+    } catch (...) {
+        return 1;
+    }
+}
+
 template <class T>
 static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
     auto&& size = mem->get_layout().size;
@@ -189,6 +196,8 @@ static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
     mem_lock<T> lock(mem, stream);
     auto mem_ptr = lock.data();
 
+    auto x_pitch = get_x_pitch(mem->get_layout());
+    std::stringstream buffer;
     for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) {
         for (cldnn::tensor::value_type b = 0; b < size.batch[0]; ++b) {
@@ -196,10 +205,11 @@ static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
             for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) {
                 for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) {
                     for (cldnn::tensor::value_type y = 0; y < size.spatial[1]; ++y) {
-                        for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x) {
-                            cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(x, y, z, w));
-                            size_t input_it = mem->get_layout().get_linear_offset(t);
-                            file_stream << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl;
+                        cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(0, y, z, w));
+                        size_t input_it = mem->get_layout().get_linear_offset(t);
+
+                        for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x, input_it += x_pitch) {
+                            buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl;
                         }
                     }
                 }
@@ -207,6 +217,7 @@ static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
             }
         }
     }
+    file_stream << buffer.str();
 }
 template <>
 void dump<uint32_t>(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
@@ -238,12 +249,13 @@ void dump<uint32_t>(memory::ptr mem, stream& stream, std::ofstream& file_stream
 }
 
 static void log_memory_to_file(memory::ptr mem, stream& stream, std::string layerName) {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
     std::string filename = layerName;
     std::replace(filename.begin(), filename.end(), '\\', '_');
     std::replace(filename.begin(), filename.end(), '/', '_');
     std::replace(filename.begin(), filename.end(), ' ', '_');
     std::replace(filename.begin(), filename.end(), ':', '_');
-    filename = DEBUG_DUMP_PATH + filename + ".txt";
+    filename = debug_config->dump_layers_path + filename + ".txt";
 
     std::ofstream file_stream(filename);
     auto mem_dt = mem->get_layout().data_type;
@@ -260,6 +272,12 @@ static void log_memory_to_file(memory::ptr mem, stream& stream, std::string laye
     else if (mem_dt == cldnn::data_types::u8)
         dump<uint8_t>(mem, stream, file_stream);
 }
+#else
+static void log_memory_to_file(memory::ptr mem, stream& stream, std::string layerName) {
+    (void)mem;
+    (void)stream;
+    (void)layerName;
+}
 #endif
 
 /* Network_impl will always have net_id = 0 when it will be cldnn internal micronetwork (created i.e by propagate_constants
@@ -487,25 +505,24 @@ void network_impl::execute(const std::vector<event::ptr>& events) {
     set_arguments();
 
     for (auto& inst : _exec_order) {
-#ifdef DEBUG_DUMP_PATH
-        auto& node = _program->get_node(inst->id());
-
-        std::string layer_name = node.id();
-#if DUMP_VERBOSE
-        std::cerr << get_primitive_info(inst->id()) << std::endl;
-#endif
-#if DUMP_SINGLE_LAYER
-        if (layer_name == DUMP_LAYER_NAME) {
-#endif
-            std::cerr << "Dump " << layer_name << " layer" << std::endl;
-            for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) {
-                log_memory_to_file(get_primitive(inst->id())->dep_memory_ptr(i), get_stream(),
-                                   layer_name + "_src_" + std::to_string(i));
+        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
+            auto& node = _program->get_node(inst->id());
+            std::string layer_name = node.id();
+            GPU_DEBUG_IF(debug_config->verbose >= 2) {
+                std::cerr << get_primitive_info(inst->id()) << std::endl;
+            }
+
+            GPU_DEBUG_IF(debug_config->dump_layers_dst_only == 0 &&
+                         (debug_config->dump_layers.length() == 0 ||
+                          (debug_config->dump_layers.length() != 0 && debug_config->dump_layers.find(" " + layer_name + " ") != std::string::npos))) {
+                std::cout << "Dump " << layer_name << " layer src" << std::endl;
+                for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) {
+                    log_memory_to_file(get_primitive(inst->id())->dep_memory_ptr(i), get_stream(),
+                                       layer_name + "_src_" + std::to_string(i));
+                }
             }
         }
-#if DUMP_SINGLE_LAYER
-        }
-#endif
-#endif
+
         GPU_DEBUG_IF(debug_config->verbose >= 1) {
             GPU_DEBUG_COUT << "Execute " << inst->id() << std::endl;
         }
@@ -517,16 +534,16 @@ void network_impl::execute(const std::vector<event::ptr>& events) {
         }
 
         execute_primitive(inst, events);
-#ifdef DEBUG_DUMP_PATH
-        get_stream().finish();
-#if DUMP_SINGLE_LAYER
-        if (layer_name == DUMP_LAYER_NAME)
-#endif
-        {
-            log_memory_to_file(get_primitive(inst->id())->output_memory_ptr(), get_stream(), layer_name + "_dst_0");
+        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
+            get_stream().finish();
+            auto& node = _program->get_node(inst->id());
+            std::string layer_name = node.id();
+            GPU_DEBUG_IF(debug_config->dump_layers.length() == 0 ||
+                         (debug_config->dump_layers.length() != 0 && debug_config->dump_layers.find(" " + layer_name + " ") != std::string::npos)) {
+                std::cout << "Dump " << layer_name << " layer dst" << std::endl;
+                log_memory_to_file(get_primitive(inst->id())->output_memory_ptr(), get_stream(), layer_name + "_dst_0");
+            }
         }
-
-#endif
     }
 
     for (auto& inst : _program->get_processing_order()) {
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/debug_config_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/debug_config_gpu_test.cpp
new file mode 100644
index 00000000000..4f9fe3d603d
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/debug_config_gpu_test.cpp
@@ -0,0 +1,20 @@
+// Copyright (C) 2018-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "test_utils/test_utils.h"
+#include "cldnn/runtime/debug_configuration.hpp"
+
+using namespace cldnn;
+using namespace ::tests;
+
+TEST(debug_config_test, check_debug_config_off_on_release) {
+#ifdef NDEBUG
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(1) {
+        GTEST_FAIL(); /* This should be disabled in case of release build */
+    }
+#endif
+}
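
Usage note (not part of the patch): the new knobs are plain environment variables read once when the debug_configuration singleton is first constructed, e.g. running an application with OV_GPU_Verbose=1, OV_GPU_PrintMultiKernelPerf=1, OV_GPU_DumpLayersPath=/tmp/cldnn_dump/ (note the trailing slash, since the path is prepended directly to the file name), and optionally OV_GPU_DumpLayers="conv1 relu1". A minimal sketch of how client code is expected to consult the new fields, mirroring the space-delimited matching used in network.cpp above; the helper should_dump_layer is illustrative only and does not exist in the patch:

    #include <string>
    #include "cldnn/runtime/debug_configuration.hpp"

    // Illustrative helper: OV_GPU_DumpLayers is stored with a leading and a
    // trailing space (see debug_configuration()), so an exact layer-name match
    // is a substring search for " <name> ".
    static bool should_dump_layer(const std::string& layer_name) {
        GPU_DEBUG_GET_INSTANCE(debug_config);  // whole block compiles away without GPU_DEBUG_CONFIG
        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
            // An empty OV_GPU_DumpLayers means "dump every layer".
            return debug_config->dump_layers.length() == 0 ||
                   debug_config->dump_layers.find(" " + layer_name + " ") != std::string::npos;
        }
        return false;  // dumping not enabled
    }

Keeping the delimiter handling in the constructor (rather than at every call site) is what lets the hot path reduce to a single std::string::find, and the release-build test above relies on GPU_DEBUG_IF collapsing to a never-taken branch.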