[GPU] Improve OV_GPU_DumpLayers debug configuration (#15719)
Co-authored-by: Kim,SungEun <sungeun.kim@intel.com>
commit b7bcef6864 (parent 1d5839fb92)
@@ -93,6 +93,7 @@ public:
     int dump_layers_dst_only;               // Dump only output of layers
     int dump_layers_result;                 // Dump result layers
     int dump_layers_limit_batch;            // Limit the size of batch to dump
+    int dump_layers_raw;                    // Dump raw data.
     int base_batch_for_memory_estimation;   // Base batch size to be used in memory estimation
     std::vector<std::string> after_proc;    // Start inference after the listed processes
     int serialize_compile;                  // Serialize creating primitives and compiling kernels
@@ -146,7 +146,7 @@ size_t get_x_pitch(const layout& layout) {
 }
 
 template <class T>
-void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
+void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump_raw) {
     auto&& size = mem->get_layout().get_tensor();
 
     GPU_DEBUG_GET_INSTANCE(debug_config);
@@ -155,11 +155,15 @@ void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
     tmp_size.batch[0] = batch_size;
     if (tmp_size == size) {
         file_stream << "shape: " << size.to_string() << " ";
-        file_stream << "(count: " << size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" << std::endl;
+        file_stream << "(count: " << size.count()
+                    << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")"
+                    << (dump_raw ? " raw data" : "") << std::endl;
     } else {
         file_stream << "shape: " << tmp_size.to_string() << " ";
-        file_stream << "(count: " << tmp_size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format)
-                    << ", original shape: " << size.to_string() << ")" << std::endl;
+        file_stream << "(count: " << tmp_size.count()
+                    << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format)
+                    << ", original shape: " << size.to_string() << ")"
+                    << (dump_raw ? " raw data" : "") << std::endl;
     }
 
     if (size.count() == 0) {
@@ -172,29 +176,35 @@ void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
     auto x_pitch = get_x_pitch(mem->get_layout());
     std::stringstream buffer;
 
-    for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) {
-        for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) {
-            for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) {
-                for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) {
-                    for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) {
-                        for (cldnn::tensor::value_type y = 0; y < size.spatial[1]; ++y) {
-                            cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(0, y, z, w));
-                            size_t input_it = mem->get_layout().get_linear_offset(t);
-
-                            for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x, input_it += x_pitch) {
-                                buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
+    if (!dump_raw) {
+        for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) {
+            for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) {
+                for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) {
+                    for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) {
+                        for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) {
+                            for (cldnn::tensor::value_type y = 0; y < size.spatial[1]; ++y) {
+                                cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(0, y, z, w));
+                                size_t input_it = mem->get_layout().get_linear_offset(t);
+
+                                for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x, input_it += x_pitch) {
+                                    buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        for (size_t i = 0; i < lock.size(); ++i) {
+            buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[i]) << std::endl;
+        }
+    }
     file_stream << buffer.str();
 }
 
 template <>
-void dump<uint32_t>(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
+void dump<uint32_t>(memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump_raw) {
     auto&& l = mem->get_layout();
 
     file_stream << "shape: ";
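Note on the hunk above: the default dump walks elements in logical b/f/.../x order through get_linear_offset, so padded regions of the buffer are never printed, while the new raw path iterates the mapped pointer linearly over lock.size() elements, padding included. A minimal self-contained sketch of that difference, using a hypothetical 2x3 row-padded fp32 buffer (none of these names come from the patch):

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    // Hypothetical 2x3 image stored with a row pitch of 4 (one padded column).
    const size_t width = 3, height = 2, row_pitch = 4;
    std::vector<float> buf(height * row_pitch, -1.0f);   // -1 marks padding
    for (size_t y = 0; y < height; ++y)
        for (size_t x = 0; x < width; ++x)
            buf[y * row_pitch + x] = float(y * width + x);

    std::cout << "logical order (padding skipped):" << std::endl;
    for (size_t y = 0; y < height; ++y)
        for (size_t x = 0; x < width; ++x)
            std::cout << buf[y * row_pitch + x] << std::endl;

    std::cout << "raw order (padding included):" << std::endl;
    for (float v : buf)                                  // analogous to iterating lock.size()
        std::cout << v << std::endl;
    return 0;
}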
@@ -207,23 +217,29 @@ void dump<uint32_t>(memory::ptr mem, stream& stream, std::ofstream& file_stream)
     mem_lock<uint32_t, mem_lock_type::read> lock(mem, stream);
     auto mem_ptr = lock.data();
 
-    for (cldnn::tensor::value_type b = 0; b < l.batch(); ++b) {
-        for (cldnn::tensor::value_type f = 0; f < (cldnn::tensor::value_type)ceil_div(l.feature(), 32); ++f) {
-            for (cldnn::tensor::value_type z = 0; z < l.spatial(2); ++z) {
-                for (cldnn::tensor::value_type y = 0; y < l.spatial(1); ++y) {
-                    for (cldnn::tensor::value_type x = 0; x < l.spatial(0); ++x) {
-                        cldnn::tensor t(cldnn::batch(b), cldnn::feature(f), cldnn::spatial(x, y, z, 0));
-                        size_t input_it = mem->get_layout().get_linear_offset(t);
-                        file_stream << mem_ptr[input_it] << std::endl;
-                    }
-                }
-            }
-        }
-    }
+    if (!dump_raw) {
+        for (cldnn::tensor::value_type b = 0; b < l.batch(); ++b) {
+            for (cldnn::tensor::value_type f = 0; f < (cldnn::tensor::value_type)ceil_div(l.feature(), 32); ++f) {
+                for (cldnn::tensor::value_type z = 0; z < l.spatial(2); ++z) {
+                    for (cldnn::tensor::value_type y = 0; y < l.spatial(1); ++y) {
+                        for (cldnn::tensor::value_type x = 0; x < l.spatial(0); ++x) {
+                            cldnn::tensor t(cldnn::batch(b), cldnn::feature(f), cldnn::spatial(x, y, z, 0));
+                            size_t input_it = mem->get_layout().get_linear_offset(t);
+                            file_stream << mem_ptr[input_it] << std::endl;
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        for (size_t i = 0; i < lock.size(); ++i) {
+            file_stream << std::fixed << std::setprecision(6) << mem_ptr[i] << std::endl;
+        }
+    }
 }
 
-void log_memory_to_file(memory::ptr mem, stream& stream, std::string layerName) {
-    std::cout << "Dump " << layerName << std::endl;
+void log_memory_to_file(memory::ptr mem, stream& stream, std::string layerName, bool dump_raw) {
+    std::cout << "Dump " << (dump_raw ? "raw " : "") << layerName << std::endl;
     GPU_DEBUG_GET_INSTANCE(debug_config);
     std::string filename = layerName;
     std::replace(filename.begin(), filename.end(), '\\', '_');
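The uint32_t specialization above serves cldnn::data_types::bin, where 32 one-bit features are packed into each 32-bit word, which is why the feature loop is bounded by ceil_div(l.feature(), 32). A sketch of that arithmetic (this ceil_div is a local stand-in, not the project's helper):

#include <cstdint>
#include <iostream>

// Stand-in for the integer ceiling division used by the dump loop.
constexpr int32_t ceil_div(int32_t a, int32_t b) { return (a + b - 1) / b; }

int main() {
    std::cout << ceil_div(32, 32) << std::endl;  // 1 word covers 32 features
    std::cout << ceil_div(33, 32) << std::endl;  // 33 features need 2 words
    return 0;
}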
@@ -239,17 +255,17 @@ void log_memory_to_file(memory::ptr mem, stream& stream, std::string layerName)
 
     auto mem_dt = mem->get_layout().data_type;
     if (mem_dt == cldnn::data_types::f32)
-        dump<float>(mem, stream, file_stream);
+        dump<float>(mem, stream, file_stream, dump_raw);
     else if (mem_dt == cldnn::data_types::f16)
-        dump<half_t>(mem, stream, file_stream);
+        dump<half_t>(mem, stream, file_stream, dump_raw);
     else if (mem_dt == cldnn::data_types::bin)
-        dump<uint32_t>(mem, stream, file_stream);
+        dump<uint32_t>(mem, stream, file_stream, dump_raw);
     else if (mem_dt == cldnn::data_types::i32)
-        dump<int32_t>(mem, stream, file_stream);
+        dump<int32_t>(mem, stream, file_stream, dump_raw);
     else if (mem_dt == cldnn::data_types::i8)
-        dump<int8_t>(mem, stream, file_stream);
+        dump<int8_t>(mem, stream, file_stream, dump_raw);
     else if (mem_dt == cldnn::data_types::u8)
-        dump<uint8_t>(mem, stream, file_stream);
+        dump<uint8_t>(mem, stream, file_stream, dump_raw);
 }
 
 void wait_for_the_turn() {
@@ -272,7 +288,7 @@ void wait_for_the_turn() {
 
 #else
 void dump_perf_data_raw(std::string, const std::list<std::shared_ptr<primitive_inst>>&) {}
-void log_memory_to_file(memory::ptr, stream&, std::string) {}
+void log_memory_to_file(memory::ptr, stream&, std::string, bool dump_raw) {}
 void wait_for_the_turn() {}
 #endif
 } // namespace
@@ -988,11 +1004,14 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
                 std::cerr << inst->id() << std::endl;
             }
 
-            GPU_DEBUG_IF(debug_config->dump_layers_dst_only == 0 &&
-                         debug_config->is_dumped_layer(layer_name)) {
+            GPU_DEBUG_IF(debug_config->dump_layers_dst_only == 0 && debug_config->is_dumped_layer(layer_name)) {
                 for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) {
-                    log_memory_to_file(get_primitive(inst->id())->dep_memory_ptr(i), get_stream(),
-                                       layer_name + "_src_" + std::to_string(i));
+                    log_memory_to_file(get_primitive(inst->id())->dep_memory_ptr(i),
+                                       get_stream(),
+                                       "program" + std::to_string(get_program()->get_id()) +
+                                       "_network" + std::to_string(get_id()) +
+                                       "_" + layer_name + "_src" + std::to_string(i),
+                                       debug_config->dump_layers_raw);
                 }
             }
         }
@@ -1004,8 +1023,12 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
             const std::string layer_name = inst->id();
             GPU_DEBUG_IF(debug_config->is_dumped_layer(layer_name, inst->is_output())) {
                 for (size_t i = 0; i < get_primitive(inst->id())->outputs_memory_count(); i++) {
-                    log_memory_to_file(get_primitive(inst->id())->output_memory_ptr(i), get_stream(),
-                                       layer_name + "_dst_" + std::to_string(i));
+                    log_memory_to_file(get_primitive(inst->id())->output_memory_ptr(i),
+                                       get_stream(),
+                                       "program" + std::to_string(get_program()->get_id()) +
+                                       "_network" + std::to_string(get_id()) +
+                                       "_" + layer_name + "_dst" + std::to_string(i),
+                                       debug_config->dump_layers_raw);
                 }
             }
         }
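Both call sites above now prefix the dump name with the program and network ids, yielding names of the form program<id>_network<id>_<layer>_src<i> or ..._dst<i>, so dumps from different programs and networks no longer collide. A standalone sketch of the naming (the ids and layer name here are placeholders, not values from the patch):

#include <iostream>
#include <string>

int main() {
    const int program_id = 1, network_id = 0;        // placeholder ids
    const std::string layer_name = "convolution_2";  // placeholder layer
    for (int i = 0; i < 2; ++i) {
        std::cout << "program" + std::to_string(program_id) +
                     "_network" + std::to_string(network_id) +
                     "_" + layer_name + "_dst" + std::to_string(i)
                  << std::endl;
    }
    return 0;
}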
@@ -117,6 +117,7 @@ static void print_help_messages() {
     message_list.emplace_back("OV_GPU_DumpLayersResult", "Dump output buffers of result layers only");
     message_list.emplace_back("OV_GPU_DumpLayersDstOnly", "Dump only output of layers");
     message_list.emplace_back("OV_GPU_DumpLayersLimitBatch", "Limit the size of batch to dump");
+    message_list.emplace_back("OV_GPU_DumpLayersRaw", "If true, dump data is stored in raw memory format.");
     message_list.emplace_back("OV_GPU_DryRunPath", "Dry run and serialize execution graph into the specified path");
     message_list.emplace_back("OV_GPU_BaseBatchForMemEstimation", "Base batch size to be used in memory estimation");
     message_list.emplace_back("OV_GPU_AfterProc", "Run inference after the specified process PIDs are finished, separated by space."
|
|||||||
, dump_layers_dst_only(0)
|
, dump_layers_dst_only(0)
|
||||||
, dump_layers_result(0)
|
, dump_layers_result(0)
|
||||||
, dump_layers_limit_batch(std::numeric_limits<int>::max())
|
, dump_layers_limit_batch(std::numeric_limits<int>::max())
|
||||||
|
, dump_layers_raw(0)
|
||||||
, base_batch_for_memory_estimation(-1)
|
, base_batch_for_memory_estimation(-1)
|
||||||
, serialize_compile(0)
|
, serialize_compile(0)
|
||||||
, max_kernels_per_batch(0) {
|
, max_kernels_per_batch(0) {
|
||||||
@@ -168,6 +170,7 @@ debug_configuration::debug_configuration()
     get_gpu_debug_env_var("DumpSources", dump_sources);
     get_gpu_debug_env_var("DumpLayersPath", dump_layers_path);
     get_gpu_debug_env_var("DumpLayersLimitBatch", dump_layers_limit_batch);
+    get_gpu_debug_env_var("DumpLayersRaw", dump_layers_raw);
     get_gpu_debug_env_var("DumpLayersDstOnly", dump_layers_dst_only);
     get_gpu_debug_env_var("DumpLayersResult", dump_layers_result);
     get_gpu_debug_env_var("DisableOnednn", disable_onednn);