[GPU] Improve OV_GPU_DumpLayers debug configuration (#15719)

Co-authored-by: Kim,SungEun <sungeun.kim@intel.com>
This commit is contained in:
Dohyun Kim (Felix) 2023-02-19 23:57:19 +09:00 committed by GitHub
parent 1d5839fb92
commit b7bcef6864
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 65 additions and 38 deletions

View File

@ -93,6 +93,7 @@ public:
int dump_layers_dst_only; // Dump only output of layers int dump_layers_dst_only; // Dump only output of layers
int dump_layers_result; // Dump result layers int dump_layers_result; // Dump result layers
int dump_layers_limit_batch; // Limit the size of batch to dump int dump_layers_limit_batch; // Limit the size of batch to dump
int dump_layers_raw; // Dump raw data.
int base_batch_for_memory_estimation; // Base batch size to be used in memory estimation int base_batch_for_memory_estimation; // Base batch size to be used in memory estimation
std::vector<std::string> after_proc; // Start inference after the listed processes std::vector<std::string> after_proc; // Start inference after the listed processes
int serialize_compile; // Serialize creating primitives and compiling kernels int serialize_compile; // Serialize creating primitives and compiling kernels

View File

@ -146,7 +146,7 @@ size_t get_x_pitch(const layout& layout) {
} }
template <class T> template <class T>
void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) { void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump_raw) {
auto&& size = mem->get_layout().get_tensor(); auto&& size = mem->get_layout().get_tensor();
GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_GET_INSTANCE(debug_config);
@ -155,11 +155,15 @@ void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
tmp_size.batch[0] = batch_size; tmp_size.batch[0] = batch_size;
if (tmp_size == size) { if (tmp_size == size) {
file_stream << "shape: " << size.to_string() << " "; file_stream << "shape: " << size.to_string() << " ";
file_stream << "(count: " << size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" << std::endl; file_stream << "(count: " << size.count()
<< ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")"
<< (dump_raw ? " raw data" : "") << std::endl;
} else { } else {
file_stream << "shape: " << tmp_size.to_string() << " "; file_stream << "shape: " << tmp_size.to_string() << " ";
file_stream << "(count: " << tmp_size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) file_stream << "(count: " << tmp_size.count()
<< ", original shape: " << size.to_string() << ")" << std::endl; << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format)
<< ", original shape: " << size.to_string() << ")"
<< (dump_raw ? " raw data" : "") << std::endl;
} }
if (size.count() == 0) { if (size.count() == 0) {
@ -172,29 +176,35 @@ void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
auto x_pitch = get_x_pitch(mem->get_layout()); auto x_pitch = get_x_pitch(mem->get_layout());
std::stringstream buffer; std::stringstream buffer;
for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) { if (!dump_raw) {
for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) { for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) {
for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) { for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) {
for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) { for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) {
for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) { for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) {
for (cldnn::tensor::value_type y = 0; y < size.spatial[1]; ++y) { for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) {
cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(0, y, z, w)); for (cldnn::tensor::value_type y = 0; y < size.spatial[1]; ++y) {
size_t input_it = mem->get_layout().get_linear_offset(t); cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(0, y, z, w));
size_t input_it = mem->get_layout().get_linear_offset(t);
for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x, input_it += x_pitch) { for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x, input_it += x_pitch) {
buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl; buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl;
}
} }
} }
} }
} }
} }
} }
} else {
for (size_t i = 0; i < lock.size(); ++i) {
buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[i]) << std::endl;
}
} }
file_stream << buffer.str(); file_stream << buffer.str();
} }
template <> template <>
void dump<uint32_t>(memory::ptr mem, stream& stream, std::ofstream& file_stream) { void dump<uint32_t>(memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump_raw) {
auto&& l = mem->get_layout(); auto&& l = mem->get_layout();
file_stream << "shape: "; file_stream << "shape: ";
@ -207,23 +217,29 @@ void dump<uint32_t>(memory::ptr mem, stream& stream, std::ofstream& file_stream)
mem_lock<uint32_t, mem_lock_type::read> lock(mem, stream); mem_lock<uint32_t, mem_lock_type::read> lock(mem, stream);
auto mem_ptr = lock.data(); auto mem_ptr = lock.data();
for (cldnn::tensor::value_type b = 0; b < l.batch(); ++b) { if (!dump_raw) {
for (cldnn::tensor::value_type f = 0; f < (cldnn::tensor::value_type)ceil_div(l.feature(), 32); ++f) { for (cldnn::tensor::value_type b = 0; b < l.batch(); ++b) {
for (cldnn::tensor::value_type z = 0; z < l.spatial(2); ++z) { for (cldnn::tensor::value_type f = 0; f < (cldnn::tensor::value_type)ceil_div(l.feature(), 32); ++f) {
for (cldnn::tensor::value_type y = 0; y < l.spatial(1); ++y) { for (cldnn::tensor::value_type z = 0; z < l.spatial(2); ++z) {
for (cldnn::tensor::value_type x = 0; x < l.spatial(0); ++x) { for (cldnn::tensor::value_type y = 0; y < l.spatial(1); ++y) {
cldnn::tensor t(cldnn::batch(b), cldnn::feature(f), cldnn::spatial(x, y, z, 0)); for (cldnn::tensor::value_type x = 0; x < l.spatial(0); ++x) {
size_t input_it = mem->get_layout().get_linear_offset(t); cldnn::tensor t(cldnn::batch(b), cldnn::feature(f), cldnn::spatial(x, y, z, 0));
file_stream << mem_ptr[input_it] << std::endl; size_t input_it = mem->get_layout().get_linear_offset(t);
file_stream << mem_ptr[input_it] << std::endl;
}
} }
} }
} }
} }
} else {
for (size_t i = 0; i < lock.size(); ++i) {
file_stream << std::fixed << std::setprecision(6) << mem_ptr[i] << std::endl;
}
} }
} }
void log_memory_to_file(memory::ptr mem, stream& stream, std::string layerName) { void log_memory_to_file(memory::ptr mem, stream& stream, std::string layerName, bool dump_raw) {
std::cout << "Dump " << layerName << std::endl; std::cout << "Dump " << (dump_raw ? "raw " : "") << layerName << std::endl;
GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_GET_INSTANCE(debug_config);
std::string filename = layerName; std::string filename = layerName;
std::replace(filename.begin(), filename.end(), '\\', '_'); std::replace(filename.begin(), filename.end(), '\\', '_');
@ -239,17 +255,17 @@ void log_memory_to_file(memory::ptr mem, stream& stream, std::string layerName)
auto mem_dt = mem->get_layout().data_type; auto mem_dt = mem->get_layout().data_type;
if (mem_dt == cldnn::data_types::f32) if (mem_dt == cldnn::data_types::f32)
dump<float>(mem, stream, file_stream); dump<float>(mem, stream, file_stream, dump_raw);
else if (mem_dt == cldnn::data_types::f16) else if (mem_dt == cldnn::data_types::f16)
dump<half_t>(mem, stream, file_stream); dump<half_t>(mem, stream, file_stream, dump_raw);
else if (mem_dt == cldnn::data_types::bin) else if (mem_dt == cldnn::data_types::bin)
dump<uint32_t>(mem, stream, file_stream); dump<uint32_t>(mem, stream, file_stream, dump_raw);
else if (mem_dt == cldnn::data_types::i32) else if (mem_dt == cldnn::data_types::i32)
dump<int32_t>(mem, stream, file_stream); dump<int32_t>(mem, stream, file_stream, dump_raw);
else if (mem_dt == cldnn::data_types::i8) else if (mem_dt == cldnn::data_types::i8)
dump<int8_t>(mem, stream, file_stream); dump<int8_t>(mem, stream, file_stream, dump_raw);
else if (mem_dt == cldnn::data_types::u8) else if (mem_dt == cldnn::data_types::u8)
dump<uint8_t>(mem, stream, file_stream); dump<uint8_t>(mem, stream, file_stream, dump_raw);
} }
void wait_for_the_turn() { void wait_for_the_turn() {
@ -272,7 +288,7 @@ void wait_for_the_turn() {
#else #else
void dump_perf_data_raw(std::string, const std::list<std::shared_ptr<primitive_inst>>&) {} void dump_perf_data_raw(std::string, const std::list<std::shared_ptr<primitive_inst>>&) {}
void log_memory_to_file(memory::ptr, stream&, std::string) {} void log_memory_to_file(memory::ptr, stream&, std::string, bool dump_raw) {}
void wait_for_the_turn() {} void wait_for_the_turn() {}
#endif #endif
} // namespace } // namespace
@ -988,11 +1004,14 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
std::cerr << inst->id() << std::endl; std::cerr << inst->id() << std::endl;
} }
GPU_DEBUG_IF(debug_config->dump_layers_dst_only == 0 && GPU_DEBUG_IF(debug_config->dump_layers_dst_only == 0 && debug_config->is_dumped_layer(layer_name)) {
debug_config->is_dumped_layer(layer_name)) {
for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) { for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) {
log_memory_to_file(get_primitive(inst->id())->dep_memory_ptr(i), get_stream(), log_memory_to_file(get_primitive(inst->id())->dep_memory_ptr(i),
layer_name + "_src_" + std::to_string(i)); get_stream(),
"program" + std::to_string(get_program()->get_id()) +
"_network" + std::to_string(get_id()) +
"_" + layer_name + "_src" + std::to_string(i),
debug_config->dump_layers_raw);
} }
} }
} }
@ -1004,8 +1023,12 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
const std::string layer_name = inst->id(); const std::string layer_name = inst->id();
GPU_DEBUG_IF(debug_config->is_dumped_layer(layer_name, inst->is_output())) { GPU_DEBUG_IF(debug_config->is_dumped_layer(layer_name, inst->is_output())) {
for (size_t i = 0; i < get_primitive(inst->id())->outputs_memory_count(); i++) { for (size_t i = 0; i < get_primitive(inst->id())->outputs_memory_count(); i++) {
log_memory_to_file(get_primitive(inst->id())->output_memory_ptr(i), get_stream(), log_memory_to_file(get_primitive(inst->id())->output_memory_ptr(i),
layer_name + "_dst_" + std::to_string(i)); get_stream(),
"program" + std::to_string(get_program()->get_id()) +
"_network" + std::to_string(get_id()) +
"_" + layer_name + "_dst" + std::to_string(i),
debug_config->dump_layers_raw);
} }
} }
} }

View File

@ -117,6 +117,7 @@ static void print_help_messages() {
message_list.emplace_back("OV_GPU_DumpLayersResult", "Dump output buffers of result layers only"); message_list.emplace_back("OV_GPU_DumpLayersResult", "Dump output buffers of result layers only");
message_list.emplace_back("OV_GPU_DumpLayersDstOnly", "Dump only output of layers"); message_list.emplace_back("OV_GPU_DumpLayersDstOnly", "Dump only output of layers");
message_list.emplace_back("OV_GPU_DumpLayersLimitBatch", "Limit the size of batch to dump"); message_list.emplace_back("OV_GPU_DumpLayersLimitBatch", "Limit the size of batch to dump");
message_list.emplace_back("OV_GPU_DumpLayersRaw", "If true, dump data is stored in raw memory format.");
message_list.emplace_back("OV_GPU_DryRunPath", "Dry run and serialize execution graph into the specified path"); message_list.emplace_back("OV_GPU_DryRunPath", "Dry run and serialize execution graph into the specified path");
message_list.emplace_back("OV_GPU_BaseBatchForMemEstimation", "Base batch size to be used in memory estimation"); message_list.emplace_back("OV_GPU_BaseBatchForMemEstimation", "Base batch size to be used in memory estimation");
message_list.emplace_back("OV_GPU_AfterProc", "Run inference after the specified process PIDs are finished, separated by space." message_list.emplace_back("OV_GPU_AfterProc", "Run inference after the specified process PIDs are finished, separated by space."
@ -156,6 +157,7 @@ debug_configuration::debug_configuration()
, dump_layers_dst_only(0) , dump_layers_dst_only(0)
, dump_layers_result(0) , dump_layers_result(0)
, dump_layers_limit_batch(std::numeric_limits<int>::max()) , dump_layers_limit_batch(std::numeric_limits<int>::max())
, dump_layers_raw(0)
, base_batch_for_memory_estimation(-1) , base_batch_for_memory_estimation(-1)
, serialize_compile(0) , serialize_compile(0)
, max_kernels_per_batch(0) { , max_kernels_per_batch(0) {
@ -168,6 +170,7 @@ debug_configuration::debug_configuration()
get_gpu_debug_env_var("DumpSources", dump_sources); get_gpu_debug_env_var("DumpSources", dump_sources);
get_gpu_debug_env_var("DumpLayersPath", dump_layers_path); get_gpu_debug_env_var("DumpLayersPath", dump_layers_path);
get_gpu_debug_env_var("DumpLayersLimitBatch", dump_layers_limit_batch); get_gpu_debug_env_var("DumpLayersLimitBatch", dump_layers_limit_batch);
get_gpu_debug_env_var("DumpLayersRaw", dump_layers_raw);
get_gpu_debug_env_var("DumpLayersDstOnly", dump_layers_dst_only); get_gpu_debug_env_var("DumpLayersDstOnly", dump_layers_dst_only);
get_gpu_debug_env_var("DumpLayersResult", dump_layers_result); get_gpu_debug_env_var("DumpLayersResult", dump_layers_result);
get_gpu_debug_env_var("DisableOnednn", disable_onednn); get_gpu_debug_env_var("DisableOnednn", disable_onednn);