From a4518ae5953832d8b4cb239744f14138e3afbb6e Mon Sep 17 00:00:00 2001
From: Jade Cho
Date: Mon, 20 Dec 2021 14:19:53 +0900
Subject: [PATCH] [GPU] Add DumpLayersLimitBatch config to GPU debug utils. (#9196)

* Add DumpLayersLimitBatch config to GPU debug utils.

+ Support OV_GPU_Help config
+ Only run first inference if OV_GPU_DumpLayersPath is set.
+ Fix dump graph bug.

* Apply some comments

* Remove unnecessary code.
---
 .../intel_gpu/runtime/debug_configuration.hpp |  2 +
 .../clDNN/runtime/debug_configuration.cpp     | 41 ++++++++++++++++++-
 .../graph_optimizer/graph_initializations.cpp |  5 ---
 .../thirdparty/clDNN/src/network.cpp          | 16 ++++++--
 .../thirdparty/clDNN/src/program.cpp          |  7 +++-
 .../intel_gpu/src/plugin/infer_request.cpp    |  7 ++++
 6 files changed, 68 insertions(+), 10 deletions(-)

diff --git a/inference-engine/thirdparty/clDNN/api/intel_gpu/runtime/debug_configuration.hpp b/inference-engine/thirdparty/clDNN/api/intel_gpu/runtime/debug_configuration.hpp
index d96c151e5fe..600a322491b 100644
--- a/inference-engine/thirdparty/clDNN/api/intel_gpu/runtime/debug_configuration.hpp
+++ b/inference-engine/thirdparty/clDNN/api/intel_gpu/runtime/debug_configuration.hpp
@@ -24,6 +24,7 @@ private:
     debug_configuration();
 public:
     static const char *prefix;
+    int help; // Print help messages
     int verbose; // Verbose execution
     int print_multi_kernel_perf; // Print execution time of each kernel in multi-kernel primitimive
     int disable_usm; // Disable usm usage
@@ -34,6 +35,7 @@ public:
     std::string dump_layers; // Dump intermediate buffers of specified layers only, separated by space
     std::string dry_run_path; // Dry run and serialize execution graph into the specified path
     int dump_layers_dst_only; // Dump only output of layers
+    int dump_layers_limit_batch; // Limit the size of batch to dump
     int base_batch_for_memory_estimation; // Base batch size to be used in memory estimation
     static const debug_configuration *get_instance();
 };
diff --git a/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp b/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp
index 33ecf9625ba..2a387b39515 100644
--- a/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp
+++ b/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp
@@ -3,7 +3,9 @@
 //
 
 #include "intel_gpu/runtime/debug_configuration.hpp"
+#include <iomanip>
 #include <iostream>
+#include <limits>
 #include <memory>
 #include <ostream>
 #include <sstream>
@@ -97,10 +99,39 @@ void get_common_debug_env_var(const std::string &var, T &val) {
     return get_debug_env_var(var, val, allowed_option_prefixes);
 }
 
+static void print_help_messages() {
+    std::vector<std::pair<std::string, std::string>> message_list;
+    message_list.emplace_back("OV_GPU_Help", "Print help messages");
+    message_list.emplace_back("OV_GPU_Verbose", "Verbose execution");
+    message_list.emplace_back("OV_GPU_PrintMultiKernelPerf", "Print execution time of each kernel in multi-kernel primitimive");
+    message_list.emplace_back("OV_GPU_DisableUsm", "Disable usm usage");
+    message_list.emplace_back("OV_GPU_DisableOnednn", "Disable onednn for discrete GPU (no effect for integrated GPU)");
+    message_list.emplace_back("OV_GPU_DumpGraphs", "Dump optimized graph");
+    message_list.emplace_back("OV_GPU_DumpSources", "Dump opencl sources");
+    message_list.emplace_back("OV_GPU_DumpLayersPath", "Enable dumping intermediate buffers and set the dest path");
+    message_list.emplace_back("OV_GPU_DumpLayers", "Dump intermediate buffers of specified layers only, separated by space");
+    message_list.emplace_back("OV_GPU_DumpLayersDstOnly", "Dump only output of layers");
message_list.emplace_back("OV_GPU_DumpLayersLimitBatch", "Limit the size of batch to dump"); + message_list.emplace_back("OV_GPU_DryRunPath", "Dry run and serialize execution graph into the specified path"); + message_list.emplace_back("OV_GPU_BaseBatchForMemEstimation", "Base batch size to be used in memory estimation"); + + auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(), + [](std::pair& a, std::pair& b){ + return a.first.size() < b.first.size(); + }); + int name_width = static_cast(max_name_length_item->first.size()) + 2; + + GPU_DEBUG_COUT << "Supported environment variables for debugging" << std::endl; + for (auto& p : message_list) { + GPU_DEBUG_COUT << " - " << std::left << std::setw(name_width) << p.first + ": " << p.second << std::endl; + } +} + #endif debug_configuration::debug_configuration() - : verbose(0) + : help(0) + , verbose(0) , print_multi_kernel_perf(0) , disable_usm(0) , dump_graphs(std::string()) @@ -110,8 +141,10 @@ debug_configuration::debug_configuration() , dump_layers_dst_only(0) , dry_run_path(std::string()) , disable_onednn(0) + , dump_layers_limit_batch(std::numeric_limits::max()) , base_batch_for_memory_estimation(-1) { #ifdef GPU_DEBUG_CONFIG + get_gpu_debug_env_var("Help", help); get_common_debug_env_var("Verbose", verbose); get_gpu_debug_env_var("PrintMultiKernelPerf", print_multi_kernel_perf); get_gpu_debug_env_var("DisableUsm", disable_usm); @@ -120,10 +153,16 @@ debug_configuration::debug_configuration() get_gpu_debug_env_var("DumpLayersPath", dump_layers_path); get_gpu_debug_env_var("DumpLayers", dump_layers); get_gpu_debug_env_var("DumpLayersDstOnly", dump_layers_dst_only); + get_gpu_debug_env_var("DumpLayersLimitBatch", dump_layers_limit_batch); get_gpu_debug_env_var("DisableOnednn", disable_onednn); get_gpu_debug_env_var("DryRunPath", dry_run_path); get_gpu_debug_env_var("BaseBatchForMemEstimation", base_batch_for_memory_estimation); + if (help > 0) { + print_help_messages(); + exit(0); + } + if (dump_layers.length() > 0) dump_layers = " " + dump_layers + " "; // Insert delimiter for easier parsing when used #endif diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/graph_initializations.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/graph_initializations.cpp index f579819ac9f..0a1ba4156b7 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/graph_initializations.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/graph_initializations.cpp @@ -436,10 +436,5 @@ void graph_initializations::run(program& p) { } set_outputs(p); p.get_processing_order().calc_processing_order(p); - - for (auto& node : p.get_processing_order()) { - if (!node->is_type()) - node->get_output_layout(); - } } } // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/src/network.cpp b/inference-engine/thirdparty/clDNN/src/network.cpp index febc8d61bce..29210058d6b 100644 --- a/inference-engine/thirdparty/clDNN/src/network.cpp +++ b/inference-engine/thirdparty/clDNN/src/network.cpp @@ -110,8 +110,18 @@ template static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) { auto&& size = mem->get_layout().size; - file_stream << "shape: " << size.to_string() << " "; - file_stream << "(count: " << size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" << std::endl; + GPU_DEBUG_GET_INSTANCE(debug_config); + auto batch_size = std::max(std::min(debug_config->dump_layers_limit_batch, size.batch[0]), 1); + tensor tmp_size(size); + 
+    tmp_size.batch[0] = batch_size;
+    if (tmp_size == size) {
+        file_stream << "shape: " << size.to_string() << " ";
+        file_stream << "(count: " << size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" << std::endl;
+    } else {
+        file_stream << "shape: " << tmp_size.to_string() << " ";
+        file_stream << "(count: " << tmp_size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format)
+                    << ", original shape: " << size.to_string() << ")" << std::endl;
+    }
 
     mem_lock<T> lock(mem, stream);
     auto mem_ptr = lock.data();
@@ -119,7 +129,7 @@ static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
     std::stringstream buffer;
 
     for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) {
-        for (cldnn::tensor::value_type b = 0; b < size.batch[0]; ++b) {
+        for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) {
             for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) {
                 for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) {
                     for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) {
diff --git a/inference-engine/thirdparty/clDNN/src/program.cpp b/inference-engine/thirdparty/clDNN/src/program.cpp
index ebdc3f3920e..d4cb50f41c5 100644
--- a/inference-engine/thirdparty/clDNN/src/program.cpp
+++ b/inference-engine/thirdparty/clDNN/src/program.cpp
@@ -1225,8 +1225,13 @@ program::primitives_info program::get_current_stage_info() const {
 
 void program::save_pass_info(std::string pass_name) {
     // TODO: Directory path here can be probably changed to some bool flag
-    if (!options.get<build_option_type::graph_dumps_dir>()->directory_path.empty())
+    if (!options.get<build_option_type::graph_dumps_dir>()->directory_path.empty()) {
+        for (auto& node : this->get_processing_order()) {
+            if (!node->is_type<data>())
+                node->get_output_layout();
+        }
         optimizer_passes_info.emplace_back(pass_name, get_current_stage_info());
+    }
 }
 
 void program::add_optimized_primitive_info(primitive_id optimized_primitive_id,
diff --git a/src/plugins/intel_gpu/src/plugin/infer_request.cpp b/src/plugins/intel_gpu/src/plugin/infer_request.cpp
index 4801fb9a93f..2fc215403c3 100644
--- a/src/plugins/intel_gpu/src/plugin/infer_request.cpp
+++ b/src/plugins/intel_gpu/src/plugin/infer_request.cpp
@@ -557,6 +557,13 @@ void InferRequest::enqueue() {
 
     internal_outputs.clear();
     internal_outputs = m_graph->GetNetwork()->execute(dependencies);
+
+    // If dump layers path is set, only runs first inference.
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
+        GPU_DEBUG_COUT << "Only run first inference to dump layers." << std::endl;
+        exit(0);
+    }
 }
 
 void InferRequest::wait_notify() {
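Note on the batch-limiting rule used in the network.cpp dump path above: the dumped batch dimension is clamped to max(min(dump_layers_limit_batch, actual_batch), 1), and dump_layers_limit_batch defaults to std::numeric_limits<int>::max(), so dumps stay unlimited unless OV_GPU_DumpLayersLimitBatch is set. The sketch below is illustrative only and not part of the patch; clamp_batch() is a hypothetical standalone helper that mirrors the std::max/std::min expression inlined in dump().

// Illustrative sketch, not part of the patch: demonstrates the clamp applied to the
// batch dimension before dumping intermediate buffers.
#include <algorithm>
#include <iostream>
#include <limits>

// Mirrors the expression in dump(): at least 1 batch element, at most the configured limit.
static int clamp_batch(int limit, int actual_batch) {
    return std::max(std::min(limit, actual_batch), 1);
}

int main() {
    const int unlimited = std::numeric_limits<int>::max();          // default limit
    std::cout << clamp_batch(unlimited, 32) << std::endl;           // 32: full batch is dumped
    std::cout << clamp_batch(4, 32) << std::endl;                   // 4: e.g. OV_GPU_DumpLayersLimitBatch=4
    std::cout << clamp_batch(0, 32) << std::endl;                   // 1: never dumps fewer than one element
    return 0;
}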