[GPU] Add DumpLayersLimitBatch config to GPU debug utils. (#9196)

* Add DumpLayersLimitBatch config to GPU debug utils.

+ Support OV_GPU_Help config
+ Only run first inference if OV_GPU_DumpLayersPath is set.
+ Fix dump graph bug.

* Apply review comments

* Remove unnecessary code.
Jade Cho, 2021-12-20 14:19:53 +09:00, committed by GitHub
commit a4518ae595 (parent abcd7486a9)
6 changed files with 68 additions and 10 deletions

@@ -24,6 +24,7 @@ private:
     debug_configuration();
 public:
     static const char *prefix;
+    int help;                                 // Print help messages
     int verbose;                              // Verbose execution
     int print_multi_kernel_perf;              // Print execution time of each kernel in multi-kernel primitimive
     int disable_usm;                          // Disable usm usage
@@ -34,6 +35,7 @@ public:
     std::string dump_layers;                  // Dump intermediate buffers of specified layers only, separated by space
     std::string dry_run_path;                 // Dry run and serialize execution graph into the specified path
     int dump_layers_dst_only;                 // Dump only output of layers
+    int dump_layers_limit_batch;              // Limit the size of batch to dump
     int base_batch_for_memory_estimation;     // Base batch size to be used in memory estimation
     static const debug_configuration *get_instance();
 };

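The two new fields are read through the same debug macros the plugin already uses in the hunks further down. A minimal, non-authoritative sketch of that consumption pattern (the helper name and batch argument are made up for illustration; it assumes a build where GPU_DEBUG_CONFIG is defined so the macros are active):

#include <iostream>
#include "intel_gpu/runtime/debug_configuration.hpp"

// Hypothetical helper, not part of this commit: shows how dump_layers_limit_batch
// would typically be consulted via the GPU_DEBUG_* macros used later in this diff.
static void report_dump_limit(int actual_batch) {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->dump_layers_limit_batch < actual_batch) {
        GPU_DEBUG_COUT << "dumping only the first " << debug_config->dump_layers_limit_batch
                       << " of " << actual_batch << " batch elements" << std::endl;
    }
}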

@@ -3,7 +3,9 @@
 //
 #include "intel_gpu/runtime/debug_configuration.hpp"
+#include <algorithm>
 #include <iostream>
+#include <iomanip>
 #include <memory>
 #include <vector>
 #include <sstream>
@@ -97,10 +99,39 @@ void get_common_debug_env_var(const std::string &var, T &val) {
     return get_debug_env_var(var, val, allowed_option_prefixes);
 }
 
+static void print_help_messages() {
+    std::vector<std::pair<std::string, std::string>> message_list;
+    message_list.emplace_back("OV_GPU_Help", "Print help messages");
+    message_list.emplace_back("OV_GPU_Verbose", "Verbose execution");
+    message_list.emplace_back("OV_GPU_PrintMultiKernelPerf", "Print execution time of each kernel in multi-kernel primitimive");
+    message_list.emplace_back("OV_GPU_DisableUsm", "Disable usm usage");
+    message_list.emplace_back("OV_GPU_DisableOnednn", "Disable onednn for discrete GPU (no effect for integrated GPU)");
+    message_list.emplace_back("OV_GPU_DumpGraphs", "Dump optimized graph");
+    message_list.emplace_back("OV_GPU_DumpSources", "Dump opencl sources");
+    message_list.emplace_back("OV_GPU_DumpLayersPath", "Enable dumping intermediate buffers and set the dest path");
+    message_list.emplace_back("OV_GPU_DumpLayers", "Dump intermediate buffers of specified layers only, separated by space");
+    message_list.emplace_back("OV_GPU_DumpLayersDstOnly", "Dump only output of layers");
+    message_list.emplace_back("OV_GPU_DumpLayersLimitBatch", "Limit the size of batch to dump");
+    message_list.emplace_back("OV_GPU_DryRunPath", "Dry run and serialize execution graph into the specified path");
+    message_list.emplace_back("OV_GPU_BaseBatchForMemEstimation", "Base batch size to be used in memory estimation");
+
+    auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(),
+        [](std::pair<std::string, std::string>& a, std::pair<std::string, std::string>& b) {
+            return a.first.size() < b.first.size();
+        });
+    int name_width = static_cast<int>(max_name_length_item->first.size()) + 2;
+
+    GPU_DEBUG_COUT << "Supported environment variables for debugging" << std::endl;
+    for (auto& p : message_list) {
+        GPU_DEBUG_COUT << " - " << std::left << std::setw(name_width) << p.first + ": " << p.second << std::endl;
+    }
+}
 #endif
 
 debug_configuration::debug_configuration()
-        : verbose(0)
+        : help(0)
+        , verbose(0)
         , print_multi_kernel_perf(0)
         , disable_usm(0)
         , dump_graphs(std::string())
@@ -110,8 +141,10 @@ debug_configuration::debug_configuration()
         , dump_layers_dst_only(0)
         , dry_run_path(std::string())
         , disable_onednn(0)
+        , dump_layers_limit_batch(std::numeric_limits<int>::max())
         , base_batch_for_memory_estimation(-1) {
 #ifdef GPU_DEBUG_CONFIG
+    get_gpu_debug_env_var("Help", help);
     get_common_debug_env_var("Verbose", verbose);
     get_gpu_debug_env_var("PrintMultiKernelPerf", print_multi_kernel_perf);
     get_gpu_debug_env_var("DisableUsm", disable_usm);
@@ -120,10 +153,16 @@ debug_configuration::debug_configuration()
     get_gpu_debug_env_var("DumpLayersPath", dump_layers_path);
     get_gpu_debug_env_var("DumpLayers", dump_layers);
     get_gpu_debug_env_var("DumpLayersDstOnly", dump_layers_dst_only);
+    get_gpu_debug_env_var("DumpLayersLimitBatch", dump_layers_limit_batch);
     get_gpu_debug_env_var("DisableOnednn", disable_onednn);
     get_gpu_debug_env_var("DryRunPath", dry_run_path);
     get_gpu_debug_env_var("BaseBatchForMemEstimation", base_batch_for_memory_estimation);
+
+    if (help > 0) {
+        print_help_messages();
+        exit(0);
+    }
+
     if (dump_layers.length() > 0)
         dump_layers = " " + dump_layers + " "; // Insert delimiter for easier parsing when used
 #endif

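The parser above means all of these knobs are plain environment variables carrying the OV_GPU_ prefix listed in print_help_messages. A hedged usage sketch from a POSIX test harness (the path and value are illustrative, and it assumes the class lives in the cldnn namespace and that the plugin was built with GPU_DEBUG_CONFIG defined):

#include <cstdlib>
#include "intel_gpu/runtime/debug_configuration.hpp"

int main() {
    // Illustrative values; the variables must be set before the first get_instance()
    // call, because the singleton constructor above reads them exactly once.
    setenv("OV_GPU_DumpLayersPath", "/tmp/gpu_dumps/", 1);
    setenv("OV_GPU_DumpLayersLimitBatch", "1", 1);
    const auto* cfg = cldnn::debug_configuration::get_instance();
    // In builds without GPU_DEBUG_CONFIG the instance may be null or keep defaults,
    // so guard before dereferencing.
    return (cfg != nullptr && cfg->dump_layers_limit_batch == 1) ? 0 : 1;
}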

@@ -436,10 +436,5 @@ void graph_initializations::run(program& p) {
     }
     set_outputs(p);
     p.get_processing_order().calc_processing_order(p);
-
-    for (auto& node : p.get_processing_order()) {
-        if (!node->is_type<data>())
-            node->get_output_layout();
-    }
 }
 }  // namespace cldnn

@@ -110,8 +110,18 @@ template <class T>
 static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
     auto&& size = mem->get_layout().size;
-    file_stream << "shape: " << size.to_string() << " ";
-    file_stream << "(count: " << size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" << std::endl;
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    auto batch_size = std::max(std::min(debug_config->dump_layers_limit_batch, size.batch[0]), 1);
+    tensor tmp_size(size);
+    tmp_size.batch[0] = batch_size;
+    if (tmp_size == size) {
+        file_stream << "shape: " << size.to_string() << " ";
+        file_stream << "(count: " << size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" << std::endl;
+    } else {
+        file_stream << "shape: " << tmp_size.to_string() << " ";
+        file_stream << "(count: " << tmp_size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format)
+                    << ", original shape: " << size.to_string() << ")" << std::endl;
+    }
 
     mem_lock<T, mem_lock_type::read> lock(mem, stream);
     auto mem_ptr = lock.data();
@@ -119,7 +129,7 @@ static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
     std::stringstream buffer;
 
     for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) {
-        for (cldnn::tensor::value_type b = 0; b < size.batch[0]; ++b) {
+        for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) {
             for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) {
                 for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) {
                     for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) {

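The clamp in the dump helper above bounds the dumped batch to [1, size.batch[0]], so the default limit of std::numeric_limits<int>::max() leaves the dump unchanged. A small stand-alone illustration of that arithmetic with plain ints (the helper name is illustrative, not from the commit):

#include <algorithm>
#include <cassert>
#include <limits>

// Illustrative stand-in for std::max(std::min(dump_layers_limit_batch, size.batch[0]), 1).
static int clamp_dump_batch(int limit, int actual_batch) {
    return std::max(std::min(limit, actual_batch), 1);
}

int main() {
    assert(clamp_dump_batch(std::numeric_limits<int>::max(), 32) == 32);  // default: full batch is dumped
    assert(clamp_dump_batch(4, 32) == 4);                                 // e.g. OV_GPU_DumpLayersLimitBatch=4
    assert(clamp_dump_batch(0, 32) == 1);                                 // never drops below one batch element
    return 0;
}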

@@ -1225,8 +1225,13 @@ program::primitives_info program::get_current_stage_info() const {
 void program::save_pass_info(std::string pass_name) {
     // TODO: Directory path here can be probably changed to some bool flag
-    if (!options.get<build_option_type::graph_dumps_dir>()->directory_path.empty())
+    if (!options.get<build_option_type::graph_dumps_dir>()->directory_path.empty()) {
+        for (auto& node : this->get_processing_order()) {
+            if (!node->is_type<data>())
+                node->get_output_layout();
+        }
         optimizer_passes_info.emplace_back(pass_name, get_current_stage_info());
+    }
 }
 
 void program::add_optimized_primitive_info(primitive_id optimized_primitive_id,

@@ -557,6 +557,13 @@ void InferRequest::enqueue() {
     internal_outputs.clear();
     internal_outputs = m_graph->GetNetwork()->execute(dependencies);
+
+    // If dump layers path is set, only runs first inference.
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
+        GPU_DEBUG_COUT << "Only run first inference to dump layers." << std::endl;
+        exit(0);
+    }
 }
 
 void InferRequest::wait_notify() {