[GPU] Add DumpLayersLimitBatch config to GPU debug utils. (#9196)

* Add DumpLayersLimitBatch config to GPU debug utils.
  + Support OV_GPU_Help config
  + Only run first inference if OV_GPU_DumpLayersPath is set.
  + Fix dump graph bug.
* Apply some comments
* Remove unnecessary code.
parent abcd7486a9
commit a4518ae595
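All of the behaviour added here is driven by OV_GPU_* environment variables and only takes effect in builds compiled with GPU_DEBUG_CONFIG. As a quick orientation (not part of the commit), the variables touched by this change could also be set from code before the GPU plugin is loaded; setenv is POSIX and the values are placeholders:

// Illustration only: the variables this commit is about, set programmatically.
// setenv() is POSIX (use _putenv_s on Windows); paths and values are examples.
#include <cstdlib>

int main() {
    // Prints the supported OV_GPU_* variables and exits; unset it for a real run.
    setenv("OV_GPU_Help", "1", 1);
    // Enables intermediate-buffer dumping; with this commit only the first inference runs.
    setenv("OV_GPU_DumpLayersPath", "/tmp/gpu_dumps/", 1);
    // Caps how many batch elements of each buffer are written.
    setenv("OV_GPU_DumpLayersLimitBatch", "1", 1);
    return 0;
}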
@@ -24,6 +24,7 @@ private:
     debug_configuration();
 public:
     static const char *prefix;
+    int help;                                 // Print help messages
     int verbose;                              // Verbose execution
     int print_multi_kernel_perf;              // Print execution time of each kernel in multi-kernel primitimive
     int disable_usm;                          // Disable usm usage
@@ -34,6 +35,7 @@ public:
     std::string dump_layers;                  // Dump intermediate buffers of specified layers only, separated by space
     std::string dry_run_path;                 // Dry run and serialize execution graph into the specified path
     int dump_layers_dst_only;                 // Dump only output of layers
+    int dump_layers_limit_batch;              // Limit the size of batch to dump
     int base_batch_for_memory_estimation;     // Base batch size to be used in memory estimation
     static const debug_configuration *get_instance();
 };
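The new fields are meant to be read through the existing debug macros rather than directly; a minimal sketch of a hypothetical consumer follows (the check itself is not part of this commit, and the helper name is made up for illustration). It assumes the debug_configuration header above is available.

#include <iostream>
#include <limits>
#include "intel_gpu/runtime/debug_configuration.hpp"

void report_batch_limit() {  // hypothetical helper, for illustration only
    GPU_DEBUG_GET_INSTANCE(debug_config);
    // The default of INT_MAX means "no limit", so only report when the user set OV_GPU_DumpLayersLimitBatch.
    GPU_DEBUG_IF(debug_config->dump_layers_limit_batch != std::numeric_limits<int>::max()) {
        GPU_DEBUG_COUT << "dumping at most " << debug_config->dump_layers_limit_batch
                       << " batch element(s) per buffer" << std::endl;
    }
}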
@@ -3,7 +3,9 @@
 //
 
 #include "intel_gpu/runtime/debug_configuration.hpp"
+#include <algorithm>
 #include <iostream>
+#include <iomanip>
 #include <memory>
 #include <vector>
 #include <sstream>
@@ -97,10 +99,39 @@ void get_common_debug_env_var(const std::string &var, T &val) {
     return get_debug_env_var(var, val, allowed_option_prefixes);
 }
 
+static void print_help_messages() {
+    std::vector<std::pair<std::string, std::string>> message_list;
+    message_list.emplace_back("OV_GPU_Help", "Print help messages");
+    message_list.emplace_back("OV_GPU_Verbose", "Verbose execution");
+    message_list.emplace_back("OV_GPU_PrintMultiKernelPerf", "Print execution time of each kernel in multi-kernel primitimive");
+    message_list.emplace_back("OV_GPU_DisableUsm", "Disable usm usage");
+    message_list.emplace_back("OV_GPU_DisableOnednn", "Disable onednn for discrete GPU (no effect for integrated GPU)");
+    message_list.emplace_back("OV_GPU_DumpGraphs", "Dump optimized graph");
+    message_list.emplace_back("OV_GPU_DumpSources", "Dump opencl sources");
+    message_list.emplace_back("OV_GPU_DumpLayersPath", "Enable dumping intermediate buffers and set the dest path");
+    message_list.emplace_back("OV_GPU_DumpLayers", "Dump intermediate buffers of specified layers only, separated by space");
+    message_list.emplace_back("OV_GPU_DumpLayersDstOnly", "Dump only output of layers");
+    message_list.emplace_back("OV_GPU_DumpLayersLimitBatch", "Limit the size of batch to dump");
+    message_list.emplace_back("OV_GPU_DryRunPath", "Dry run and serialize execution graph into the specified path");
+    message_list.emplace_back("OV_GPU_BaseBatchForMemEstimation", "Base batch size to be used in memory estimation");
+
+    auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(),
+                                                 [](std::pair<std::string, std::string>& a, std::pair<std::string, std::string>& b){
+                                                     return a.first.size() < b.first.size();
+                                                 });
+    int name_width = static_cast<int>(max_name_length_item->first.size()) + 2;
+
+    GPU_DEBUG_COUT << "Supported environment variables for debugging" << std::endl;
+    for (auto& p : message_list) {
+        GPU_DEBUG_COUT << " - " << std::left << std::setw(name_width) << p.first + ": " << p.second << std::endl;
+    }
+}
+
 #endif
 
 debug_configuration::debug_configuration()
-        : verbose(0)
+        : help(0)
+        , verbose(0)
         , print_multi_kernel_perf(0)
         , disable_usm(0)
         , dump_graphs(std::string())
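The column alignment in the help output comes from applying std::left/std::setw to the already concatenated "name: " string, with the field width two characters wider than the longest variable name. A standalone reproduction (plain std::cout instead of GPU_DEBUG_COUT, shortened list) behaves the same way:

// Standalone reproduction of the alignment logic in print_help_messages().
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
    std::vector<std::pair<std::string, std::string>> message_list = {
        {"OV_GPU_Help", "Print help messages"},
        {"OV_GPU_DumpLayersLimitBatch", "Limit the size of batch to dump"},
    };
    auto longest = std::max_element(message_list.begin(), message_list.end(),
                                    [](const std::pair<std::string, std::string>& a,
                                       const std::pair<std::string, std::string>& b) {
                                        return a.first.size() < b.first.size();
                                    });
    int name_width = static_cast<int>(longest->first.size()) + 2;  // two columns past the longest name

    for (auto& p : message_list) {
        // setw pads the concatenated "name: " string, so every description starts in the same column.
        std::cout << " - " << std::left << std::setw(name_width) << p.first + ": " << p.second << std::endl;
    }
    return 0;
}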
@@ -110,8 +141,10 @@ debug_configuration::debug_configuration()
     , dump_layers_dst_only(0)
     , dry_run_path(std::string())
     , disable_onednn(0)
+    , dump_layers_limit_batch(std::numeric_limits<int>::max())
     , base_batch_for_memory_estimation(-1) {
 #ifdef GPU_DEBUG_CONFIG
+    get_gpu_debug_env_var("Help", help);
     get_common_debug_env_var("Verbose", verbose);
     get_gpu_debug_env_var("PrintMultiKernelPerf", print_multi_kernel_perf);
     get_gpu_debug_env_var("DisableUsm", disable_usm);
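The new option defaults to std::numeric_limits<int>::max(), i.e. "no limit", and is filled from OV_GPU_DumpLayersLimitBatch. The actual parsing lives in get_debug_env_var(), which is outside this hunk, so the sketch below only approximates the idea; the OV_GPU_ prefix is taken from the help text above, not from the helper itself.

// Rough approximation of reading an integer debug option from the environment.
// Not the plugin's real get_gpu_debug_env_var(), which goes through
// get_debug_env_var() with a list of allowed prefixes.
#include <cstdlib>
#include <sstream>
#include <string>

static void read_int_option(const std::string& name, int& val) {
    const std::string env_name = std::string("OV_GPU_") + name;  // prefix assumed from the help text
    if (const char* env = std::getenv(env_name.c_str())) {
        std::istringstream ss(env);
        int parsed = 0;
        if (ss >> parsed)
            val = parsed;  // keep the default (e.g. INT_MAX for DumpLayersLimitBatch) on parse failure
    }
}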
@@ -120,10 +153,16 @@ debug_configuration::debug_configuration()
     get_gpu_debug_env_var("DumpLayersPath", dump_layers_path);
     get_gpu_debug_env_var("DumpLayers", dump_layers);
     get_gpu_debug_env_var("DumpLayersDstOnly", dump_layers_dst_only);
+    get_gpu_debug_env_var("DumpLayersLimitBatch", dump_layers_limit_batch);
     get_gpu_debug_env_var("DisableOnednn", disable_onednn);
     get_gpu_debug_env_var("DryRunPath", dry_run_path);
     get_gpu_debug_env_var("BaseBatchForMemEstimation", base_batch_for_memory_estimation);
+
+    if (help > 0) {
+        print_help_messages();
+        exit(0);
+    }
 
     if (dump_layers.length() > 0)
         dump_layers = " " + dump_layers + " "; // Insert delimiter for easier parsing when used
 #endif
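dump_layers is padded with a leading and trailing space so that a consumer can match whole layer names with a single substring search. The matching code is not part of this diff, so the helper below is only an illustration of what the padding enables, not the plugin's actual check.

// Sketch of the membership test the padded dump_layers string enables.
#include <iostream>
#include <string>

static bool is_dumped_layer(const std::string& dump_layers, const std::string& layer_name) {
    if (dump_layers.empty())
        return true;  // no filter set: dump everything
    // dump_layers was stored as " layerA layerB ", so exact names can be matched
    // without one name accidentally matching as a prefix of another.
    return dump_layers.find(" " + layer_name + " ") != std::string::npos;
}

int main() {
    const std::string filter = " conv1 relu4 ";              // as produced by the constructor above
    std::cout << is_dumped_layer(filter, "relu4") << " "     // 1
              << is_dumped_layer(filter, "relu") << std::endl;  // 0: not a whole-token match
    return 0;
}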
@@ -436,10 +436,5 @@ void graph_initializations::run(program& p) {
     }
     set_outputs(p);
     p.get_processing_order().calc_processing_order(p);
-
-    for (auto& node : p.get_processing_order()) {
-        if (!node->is_type<data>())
-            node->get_output_layout();
-    }
 }
 }  // namespace cldnn
@@ -110,8 +110,18 @@ template <class T>
 static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
     auto&& size = mem->get_layout().size;
 
-    file_stream << "shape: " << size.to_string() << " ";
-    file_stream << "(count: " << size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" << std::endl;
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    auto batch_size = std::max(std::min(debug_config->dump_layers_limit_batch, size.batch[0]), 1);
+    tensor tmp_size(size);
+    tmp_size.batch[0] = batch_size;
+    if (tmp_size == size) {
+        file_stream << "shape: " << size.to_string() << " ";
+        file_stream << "(count: " << size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" << std::endl;
+    } else {
+        file_stream << "shape: " << tmp_size.to_string() << " ";
+        file_stream << "(count: " << tmp_size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format)
+                    << ", original shape: " << size.to_string() << ")" << std::endl;
+    }
 
     mem_lock<T, mem_lock_type::read> lock(mem, stream);
     auto mem_ptr = lock.data();
@@ -119,7 +129,7 @@ static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
     std::stringstream buffer;
 
     for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) {
-        for (cldnn::tensor::value_type b = 0; b < size.batch[0]; ++b) {
+        for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) {
             for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) {
                 for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) {
                     for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) {
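The clamp keeps the dumped batch within [1, size.batch[0]] and caps it at the configured limit; with the INT_MAX default the whole batch is still written. A few worked values, with plain ints standing in for the cldnn::tensor fields:

// Worked example of the clamp used in dump():
//   batch_size = max(min(dump_layers_limit_batch, size.batch[0]), 1)
#include <algorithm>
#include <cassert>
#include <limits>

int main() {
    auto clamp_batch = [](int limit, int batch) { return std::max(std::min(limit, batch), 1); };

    assert(clamp_batch(std::numeric_limits<int>::max(), 32) == 32);  // default limit: dump the full batch
    assert(clamp_batch(4, 32) == 4);                                 // OV_GPU_DumpLayersLimitBatch=4
    assert(clamp_batch(4, 2) == 2);                                  // limit larger than the actual batch
    assert(clamp_batch(0, 32) == 1);                                 // never fewer than one element
    return 0;
}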
@@ -1225,8 +1225,13 @@ program::primitives_info program::get_current_stage_info() const {
 
 void program::save_pass_info(std::string pass_name) {
     // TODO: Directory path here can be probably changed to some bool flag
-    if (!options.get<build_option_type::graph_dumps_dir>()->directory_path.empty())
+    if (!options.get<build_option_type::graph_dumps_dir>()->directory_path.empty()) {
+        for (auto& node : this->get_processing_order()) {
+            if (!node->is_type<data>())
+                node->get_output_layout();
+        }
         optimizer_passes_info.emplace_back(pass_name, get_current_stage_info());
+    }
 }
 
 void program::add_optimized_primitive_info(primitive_id optimized_primitive_id,
@@ -557,6 +557,13 @@ void InferRequest::enqueue() {
 
     internal_outputs.clear();
     internal_outputs = m_graph->GetNetwork()->execute(dependencies);
+
+    // If dump layers path is set, only runs first inference.
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
+        GPU_DEBUG_COUT << "Only run first inference to dump layers." << std::endl;
+        exit(0);
+    }
 }
 
 void InferRequest::wait_notify() {
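Because the new guard calls exit(0) right after the first execute(), an application run with OV_GPU_DumpLayersPath set performs exactly one inference and then the process terminates. A rough harness for trying this out follows; the ov::Core calls, model path, and dump directory are illustrative assumptions (only the environment variable names come from this commit), and the early exit only happens in builds with GPU_DEBUG_CONFIG.

// Minimal harness illustrating the "first inference only" behaviour.
#include <cstdlib>
#include <iostream>
#include <openvino/openvino.hpp>

int main() {
    setenv("OV_GPU_DumpLayersPath", "/tmp/gpu_dumps/", 1);  // POSIX; placeholder dump directory
    setenv("OV_GPU_DumpLayers", "conv1", 1);                // optional: restrict dumping to one layer

    ov::Core core;
    auto model = core.read_model("model.xml");              // placeholder model path
    auto compiled = core.compile_model(model, "GPU");
    auto request = compiled.create_infer_request();         // input setup omitted for brevity

    request.infer();  // buffers are dumped during this call; the plugin then calls exit(0),
                      // so in a debug build the line below is never reached
    std::cout << "not reached when dumping is enabled" << std::endl;
    return 0;
}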